From 5a097067675444f54401846d040001d4fe9a6b81 Mon Sep 17 00:00:00 2001 From: ggabernet Date: Tue, 14 Jul 2020 10:56:41 +0200 Subject: [PATCH 001/200] getting functions into groovy libs --- lib/Checks.groovy | 36 ++ lib/Completion.groovy | 119 ++++++ lib/Headers.groovy | 39 ++ lib/Schema.groovy | 128 ++++++ main.nf | 555 ++----------------------- modules/local/check_samplesheet.nf | 34 ++ modules/local/get_software_versions.nf | 24 ++ modules/local/output_documentation.nf | 18 + modules/nf-core/fastqc.nf | 32 ++ modules/nf-core/multiqc.nf | 36 ++ 10 files changed, 509 insertions(+), 512 deletions(-) create mode 100644 lib/Checks.groovy create mode 100644 lib/Completion.groovy create mode 100644 lib/Headers.groovy create mode 100644 lib/Schema.groovy create mode 100644 modules/local/check_samplesheet.nf create mode 100644 modules/local/get_software_versions.nf create mode 100644 modules/local/output_documentation.nf create mode 100644 modules/nf-core/fastqc.nf create mode 100644 modules/nf-core/multiqc.nf diff --git a/lib/Checks.groovy b/lib/Checks.groovy new file mode 100644 index 0000000000..0c912c401b --- /dev/null +++ b/lib/Checks.groovy @@ -0,0 +1,36 @@ +/* + * This file holds several functions used to perform standard checks for the nf-core pipeline template. + */ + +class Checks { + + static void aws_batch(workflow, params) { + if (workflow.profile.contains('awsbatch')) { + assert params.awsqueue && params.awsregion : "Specify correct --awsqueue and --awsregion parameters on AWSBatch!" + // Check outdir paths to be S3 buckets if running on AWSBatch + // related: https://github.com/nextflow-io/nextflow/issues/813 + assert params.outdir.startsWith('s3:') : "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!" + // Prevent trace files from being stored on S3 since S3 does not support rolling files. + assert !params.tracedir.startsWith('s3:') : "Specify a local tracedir or run without trace! S3 cannot be used for tracefiles."
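+            // Illustration: `--outdir s3://my-bucket/results --tracedir ./trace` satisfies
+            // all three checks, while `--outdir ./results` halts on the S3 assert above.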
+ } + } + + static void hostname(workflow, params, log) { + Map colors = Headers.log_colours(params.monochrome_logs) + if (params.hostnames) { + def hostname = "hostname".execute().text.trim() + params.hostnames.each { prof, hnames -> + hnames.each { hname -> + if (hostname.contains(hname) && !workflow.profile.contains(prof)) { + log.info "=${colors.yellow}====================================================${colors.reset}=\n" + + "${colors.yellow}WARN: You are running with `-profile $workflow.profile`\n" + + " but your machine hostname is ${colors.white}'$hostname'${colors.reset}.\n" + + " ${colors.yellow_bold}Please use `-profile $prof${colors.reset}`\n" + + "=${colors.yellow}====================================================${colors.reset}=" + } + } + } + } + } +} + diff --git a/lib/Completion.groovy b/lib/Completion.groovy new file mode 100644 index 0000000000..0a7a2b555d --- /dev/null +++ b/lib/Completion.groovy @@ -0,0 +1,119 @@ +/* + * Functions to be run on completion of pipeline + */ + +class Completion { + static void email(workflow, params, summary, run_name, baseDir, multiqc_report, log) { + + // Set up the e-mail variables + def subject = "[$workflow.manifest.name] Successful: $workflow.runName" + if (!workflow.success) { + subject = "[$workflow.manifest.name] FAILED: $workflow.runName" + } + + def email_fields = [:] + email_fields['version'] = workflow.manifest.version + email_fields['runName'] = run_name ?: workflow.runName + email_fields['success'] = workflow.success + email_fields['dateComplete'] = workflow.complete + email_fields['duration'] = workflow.duration + email_fields['exitStatus'] = workflow.exitStatus + email_fields['errorMessage'] = (workflow.errorMessage ?: 'None') + email_fields['errorReport'] = (workflow.errorReport ?: 'None') + email_fields['commandLine'] = workflow.commandLine + email_fields['projectDir'] = workflow.projectDir + email_fields['summary'] = summary + email_fields['summary']['Date Started'] = workflow.start + email_fields['summary']['Date Completed'] = workflow.complete + email_fields['summary']['Pipeline script file path'] = workflow.scriptFile + email_fields['summary']['Pipeline script hash ID'] = workflow.scriptId + if (workflow.repository) email_fields['summary']['Pipeline repository Git URL'] = workflow.repository + if (workflow.commitId) email_fields['summary']['Pipeline repository Git Commit'] = workflow.commitId + if (workflow.revision) email_fields['summary']['Pipeline Git branch/tag'] = workflow.revision + email_fields['summary']['Nextflow Version'] = workflow.nextflow.version + email_fields['summary']['Nextflow Build'] = workflow.nextflow.build + email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp + + // TODO nf-core: If not using MultiQC, strip out this code (including params.max_multiqc_email_size) + // On success try attach the multiqc report + def mqc_report = null + try { + if (workflow.success) { + mqc_report = multiqc_report.getVal() + if (mqc_report.getClass() == ArrayList) { + log.warn "[$workflow.manifest.name] Found multiple reports from process 'MULTIQC', will use only one" + mqc_report = mqc_report[0] + } + } + } catch (all) { + log.warn "[$workflow.manifest.name] Could not attach MultiQC report to summary email" + } + + // Check if we are only sending emails on failure + def email_address = params.email + if (!params.email && params.email_on_fail && !workflow.success) { + email_address = params.email_on_fail + } + + // Render the TXT template + def engine = new 
groovy.text.GStringTemplateEngine() + def tf = new File("$baseDir/assets/email_template.txt") + def txt_template = engine.createTemplate(tf).make(email_fields) + def email_txt = txt_template.toString() + + // Render the HTML template + def hf = new File("$baseDir/assets/email_template.html") + def html_template = engine.createTemplate(hf).make(email_fields) + def email_html = html_template.toString() + + // Render the sendmail template + def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$baseDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ] + def sf = new File("$baseDir/assets/sendmail_template.txt") + def sendmail_template = engine.createTemplate(sf).make(smail_fields) + def sendmail_html = sendmail_template.toString() + + // Send the HTML e-mail + if (email_address) { + try { + if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } + // Try to send HTML e-mail using sendmail + [ 'sendmail', '-t' ].execute() << sendmail_html + log.info "[$workflow.manifest.name] Sent summary e-mail to $email_address (sendmail)" + } catch (all) { + // Catch failures and try with plaintext + def mail_cmd = [ 'mail', '-s', subject, '--content-type=text/html', email_address ] + if ( mqc_report.size() <= params.max_multiqc_email_size.toBytes() ) { + mail_cmd += [ '-A', mqc_report ] + } + mail_cmd.execute() << email_html + log.info "[$workflow.manifest.name] Sent summary e-mail to $email_address (mail)" + } + } + + // Write summary e-mail HTML to a file + def output_d = new File("${params.outdir}/pipeline_info/") + if (!output_d.exists()) { + output_d.mkdirs() + } + def output_hf = new File(output_d, "pipeline_report.html") + output_hf.withWriter { w -> w << email_html } + def output_tf = new File(output_d, "pipeline_report.txt") + output_tf.withWriter { w -> w << email_txt } + } + + static void summary(workflow, params, log) { + Map colors = Headers.log_colours(params.monochrome_logs) + if (workflow.stats.ignoredCount > 0 && workflow.success) { + log.info "-${colors.purple}Warning, pipeline completed, but with errored process(es) ${colors.reset}-" + log.info "-${colors.red}Number of ignored errored process(es) : ${workflow.stats.ignoredCount} ${colors.reset}-" + log.info "-${colors.green}Number of successfully ran process(es) : ${workflow.stats.succeedCount} ${colors.reset}-" + } + if (workflow.success) { + log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Pipeline completed successfully${colors.reset}-" + } else { + Checks.hostname(workflow, params, log) + log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed with errors${colors.reset}-" + } + } +} + diff --git a/lib/Headers.groovy b/lib/Headers.groovy new file mode 100644 index 0000000000..447141125e --- /dev/null +++ b/lib/Headers.groovy @@ -0,0 +1,39 @@ +/* + * This file holds several functions used to render the nf-core ANSI header. + */ + +class Headers { + + private static Map log_colours(Boolean monochrome_logs) { + Map colorcodes = [:] + colorcodes['reset'] = monochrome_logs ? '' : "\033[0m" + colorcodes['dim'] = monochrome_logs ? '' : "\033[2m" + colorcodes['black'] = monochrome_logs ? '' : "\033[0;30m" + colorcodes['green'] = monochrome_logs ? '' : "\033[0;32m" + colorcodes['yellow'] = monochrome_logs ? '' : "\033[0;33m" + colorcodes['yellow_bold'] = monochrome_logs ? '' : "\033[1;93m" + colorcodes['blue'] = monochrome_logs ? '' : "\033[0;34m" + colorcodes['purple'] = monochrome_logs ? '' : "\033[0;35m" + colorcodes['cyan'] = monochrome_logs ? '' : "\033[0;36m" + colorcodes['white'] = monochrome_logs ? '' : "\033[0;37m" + colorcodes['red'] = monochrome_logs ? '' : "\033[1;91m" + return colorcodes + } + + static String nf_core(workflow, monochrome_logs) { + Map colors = log_colours(monochrome_logs) + String.format( + """\n + -${colors.dim}----------------------------------------------------${colors.reset}- + ${colors.green},--.${colors.black}/${colors.green},-.${colors.reset} + ${colors.blue} ___ __ __ __ ___ ${colors.green}/,-._.--~\'${colors.reset} + ${colors.blue} |\\ | |__ __ / ` / \\ |__) |__ ${colors.yellow}} {${colors.reset} + ${colors.blue} | \\| | \\__, \\__/ | \\ |___ ${colors.green}\\`-._,-`-,${colors.reset} + ${colors.green}`._,._,\'${colors.reset} + ${colors.purple} ${workflow.manifest.name} v${workflow.manifest.version}${colors.reset} + -${colors.dim}----------------------------------------------------${colors.reset}- + """.stripIndent() + ) + } +} +
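An editorial aside on `log_colours` before the next file: returning empty strings in monochrome mode keeps coloured and plain logging on a single code path, so callers never branch on `params.monochrome_logs` themselves. A minimal sketch of that behaviour (standalone Groovy, assuming `lib/` is on the classpath; Groovy does not enforce `private`, which is also how `Checks.hostname` calls it):

```groovy
// Every colour lookup degrades to '' when monochrome logging is requested,
// so the same interpolated string works in both modes.
def colors = Headers.log_colours(true)      // monochrome_logs = true
assert colors['green'] == ''
colors = Headers.log_colours(false)         // ANSI codes enabled
println "${colors.green}Pipeline completed successfully${colors.reset}"
```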
diff --git a/lib/Schema.groovy b/lib/Schema.groovy new file mode 100644 index 0000000000..f0a6ad148d --- /dev/null +++ b/lib/Schema.groovy @@ -0,0 +1,128 @@ +/* + * This file holds several functions used to perform JSON parameter validation, help and summary rendering for the nf-core pipeline template. + */ + +import groovy.json.JsonSlurper + +class JSON { + /* + * This method tries to read a JSON params file + */ + private static LinkedHashMap params_get(String path) { + def usage = new LinkedHashMap() + try { + usage = params_try(path) + } catch (Exception e) { + println "Could not read parameters settings from JSON. $e" + usage = new LinkedHashMap() + } + return usage + }
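To make the JSON traversal in `params_try` below concrete, here is an invented two-parameter `nextflow_schema.json` and the lookup it relies on (the file contents are illustrative only, not the real sarek schema):

```groovy
import groovy.json.JsonSlurper

// Invented minimal schema following the nf-core layout described below
def json = '''{"properties": {"input_output": {"properties": {
    "input":  {"description": "Path to the input TSV"},
    "outdir": {"description": "Directory for results"}}}}}'''

// On a Groovy Map, property access such as `.properties` resolves to
// get('properties'), which is exactly what `usage."$key".properties` exploits.
def usage = new JsonSlurper().parseText(json).get('properties')
assert usage.input_output.properties.input.description == 'Path to the input TSV'
```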
+ + /* + Method to actually read in JSON file using Groovy. + Group (as Key), values are all parameters + - Parameter1 as Key, Description as Value + - Parameter2 as Key, Description as Value + .... + Group + - + */ + private static LinkedHashMap params_try(String path) throws Exception { + + def json = new File(path).text + def Map usage = (Map) new JsonSlurper().parseText(json).get('properties') + + /* Tree looks like this in nf-core schema + * properties <- this is what the first get('properties') gets us + group 1 + properties + description + group 2 + properties + description + group 3 + properties + description + */ + def output_map = new LinkedHashMap() + + // Let's go deeper + usage.each { key, val -> + def Map submap = usage."$key".properties // Gets the property object of the group + def sub_params = new LinkedHashMap() + submap.each { innerkey, value -> + sub_params.put("$innerkey", "$value.description") + } + output_map.put("$key", sub_params) + } + return output_map + } + + static String params_help(path, command) { + String output = "Typical pipeline command:\n\n" + output += " ${command}\n\n" + output += params_beautify(params_get(path)) + } + + static String params_beautify(usage) { + String output = "" + for (group in usage.keySet()) { + output += group + "\n" + def params = usage.get(group) // This gets the parameters of that particular group + for (par in params.keySet()) { + output += " \u001B[1m" + par.padRight(27) + "\u001B[0m" + params.get(par) + "\n" + } + output += "\n" + } + return output + } + + private static LinkedHashMap params_summary(workflow, params, run_name) { + def Map summary = [:] + if (workflow.revision) summary['Pipeline Release'] = workflow.revision + summary['Run Name'] = run_name ?: workflow.runName + // TODO nf-core: Report custom parameters here + summary['Input'] = params.input + summary['Fasta File'] = params.fasta + summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job" + if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container" + summary['Output dir'] = params.outdir + summary['Launch dir'] = workflow.launchDir + summary['Working dir'] = workflow.workDir + summary['Script dir'] = workflow.projectDir + summary['User'] = workflow.userName + if (workflow.profile.contains('awsbatch')) { + summary['AWS Region'] = params.awsregion + summary['AWS Queue'] = params.awsqueue + summary['AWS CLI'] = params.awscli + } + summary['Config Profile'] = workflow.profile + if (params.config_profile_description) summary['Config Profile Descr'] = params.config_profile_description + if (params.config_profile_contact) summary['Config Profile Contact'] = params.config_profile_contact + if (params.config_profile_url) summary['Config Profile URL'] = params.config_profile_url + summary['Config Files'] = workflow.configFiles.join(', ') + if (params.email || params.email_on_fail) { + summary['E-mail Address'] = params.email + summary['E-mail on failure'] = params.email_on_fail + summary['MultiQC maxsize'] = params.max_multiqc_email_size + } + return summary + }
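Since Groovy map literals are `LinkedHashMap`s, the summary keeps its insertion order, and the console rendering that main.nf gains later in this patch is a one-liner. A sketch with invented values:

```groovy
// Invented stand-in for the map params_summary returns
def summary = ['Run Name': 'focused_einstein', 'Input': 'samples.tsv', 'Output dir': './results']
println summary.collect { k, v -> "${k.padRight(20)}: $v" }.join('\n')
// Run Name            : focused_einstein
// Input               : samples.tsv
// Output dir          : ./results
```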
+ + static String params_mqc_summary(summary) { + String yaml_file_text = """ + id: 'nf-core-tcrseq-summary' + description: " - this information is collected when the pipeline is started." + section_name: 'nf-core/tcrseq Workflow Summary' + section_href: 'https://github.com/nf-core/tcrseq' + plot_type: 'html' + data: | + <dl class=\"dl-horizontal\"> + ${summary.collect { k,v -> "<dt>$k</dt><dd><samp>${v ?: '<span style=\"color:#999999;\">N/A</a>'}</samp></dd>" }.join("\n")} + </dl>
+ """.stripIndent() + + return yaml_file_text + } +} diff --git a/main.nf b/main.nf index acbc361392..7a499dfeae 100644 --- a/main.nf +++ b/main.nf @@ -19,355 +19,18 @@ nf-core/sarek: -------------------------------------------------------------------------------- */ -def helpMessage() { - log.info nfcoreHeader() - log.info""" - - Usage: - - The typical command for running the pipeline is as follows: - - nextflow run nf-core/sarek --input sample.tsv -profile docker - - Mandatory arguments: - -profile [str] Configuration profile to use - Can use multiple (comma separated) - Available: conda, docker, singularity, test and more - --input [file] Path to input TSV file on mapping, prepare_recalibration, recalibrate, variant_calling and Control-FREEC steps - Multiple TSV files can be specified with quotes - Works also with the path to a directory on mapping step with a single germline sample only - Alternatively, path to VCF input file on annotate step - Multiple VCF files can be specified with quotes - --step [list] Specify starting step (only one) - Available: mapping, prepare_recalibration, recalibrate, variant_calling, annotate, Control-FREEC - Default: ${params.step} - --genome [str] Name of iGenomes reference - Default: ${params.genome} - - Main options: - --help [bool] You're reading it - --no_intervals [bool] Disable usage of intervals - Intervals are part of the genome chopped up, used to speed up preprocessing and variant calling - --nucleotides_per_second [int] To estimate interval size - Default: ${params.nucleotides_per_second} - --sentieon [bool] If sentieon is available, will enable it for Preprocessing, and Variant Calling - Adds the following options for --tools: DNAseq, DNAscope and TNscope - --skip_qc [str] Specify which QC tools to skip when running Sarek (multiple separated with commas) - Available: all, bamQC, BaseRecalibrator, BCFtools, Documentation - FastQC, MultiQC, samtools, vcftools, versions - Default: None - --target_bed [file] Target BED file for whole exome or targeted sequencing - Default: None - --tools [str] Specify tools to use for variant calling (multiple separated with commas): - Available: ASCAT, CNVkit, ControlFREEC, FreeBayes, HaplotypeCaller - Manta, mpileup, MSIsensor, Mutect2, Strelka, TIDDIT - and/or for annotation: - snpEff, VEP, merge - Default: None - - Modify fastqs (trim/split): - --trim_fastq [bool] Run Trim Galore - --clip_r1 [int] Instructs Trim Galore to remove bp from the 5' end of read 1 (or single-end reads) - --clip_r2 [int] Instructs Trim Galore to remove bp from the 5' end of read 2 (paired-end reads only) - --three_prime_clip_r1 [int] Instructs Trim Galore to remove bp from the 3' end of read 1 AFTER adapter/quality trimming has been performed - --three_prime_clip_r2 [int] Instructs Trim Galore to remove bp from the 3' end of read 2 AFTER adapter/quality trimming has been performed - --trim_nextseq [int] Instructs Trim Galore to apply the --nextseq=X option, to trim based on quality after removing poly-G tails - --save_trimmed [bool] Save trimmed FastQ file intermediates - --split_fastq [int] Specify how many reads should be contained in the split fastq file - Default: no split - - Preprocessing: - --markdup_java_options [str] Establish values for markDuplicates memory consumption - Default: ${params.markdup_java_options} - --no_gatk_spark [bool] Disable usage of GATK Spark implementation of their tools in local mode - --save_bam_mapped [bool] Save Mapped BAMs - --skip_markduplicates [bool] Skip MarkDuplicates - - Variant Calling: - 
--ascat_ploidy [int] Use this parameter to overwrite default behavior from ASCAT regarding ploidy - Requires that --ascat_purity is set - --ascat_purity [int] Use this parameter to overwrite default behavior from ASCAT regarding purity - Requires that --ascat_ploidy is set - --cf_coeff [str] Control-FREEC coefficientOfVariation - Default: ${params.cf_coeff} - --cf_ploidy [int] Control-FREEC ploidy - Default: ${params.cf_ploidy} - --cf_window [int] Control-FREEC window size - Default: Disabled - --ignore_soft_clipped_bases [bool] Do not analyze soft clipped bases in the reads for GATK Mutect2 - --no_gvcf [bool] No g.vcf output from GATK HaplotypeCaller - --no_strelka_bp [bool] Will not use Manta candidateSmallIndels for Strelka (not recommended by Best Practices) - --pon [file] Panel-of-normals VCF (bgzipped) for GATK Mutect2 / Sentieon TNscope - See: https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_mutect_CreateSomaticPanelOfNormals.php - --pon_index [file] Index of pon panel-of-normals VCF - If none provided, will be generated automatically from the PON - - Annotation: - --annotate_tools [str] Specify from which tools Sarek should look for VCF files to annotate, only for step Annotate - Available: HaplotypeCaller, Manta, Mutect2, Strelka, TIDDIT - Default: None - --annotation_cache [bool] Enable the use of cache for annotation, to be used with --snpeff_cache and/or --vep_cache - --snpeff_cache [file] Specity the path to snpEff cache, to be used with --annotation_cache - --vep_cache [file] Specity the path to VEP cache, to be used with --annotation_cache - --cadd_cache [bool] Enable CADD cache - --cadd_indels [file] Path to CADD InDels file - --cadd_indels_tbi [file] Path to CADD InDels index - --cadd_wg_snvs [file] Path to CADD SNVs file - --cadd_wg_snvs_tbi [file] Path to CADD SNVs index - --genesplicer [file] Enable genesplicer within VEP - - References options: - --igenomes_base [file] Specify base path to AWS iGenomes - Default: ${params.igenomes_base} - --igenomes_ignore [bool] Do not use AWS iGenomes. Will load genomes.config instead of igenomes.config - --genomes_base [file] Specify base path to reference genome - --save_reference [bool] Save built references - - References: If not specified in the configuration file or you wish to overwrite any of the references. 
- --ac_loci [file] Loci file for ASCAT - --ac_loci_gc [file] Loci GC file for ASCAT - --bwa [file] BWA indexes - If none provided, will be generated automatically from the fasta reference - --chr_dir [file] Chromosomes folder - --chr_length [file] Chromosomes length file - --dbsnp [file] Dbsnp file - --dbsnp_index [file] Dbsnp index - If none provided, will be generated automatically if a dbsnp file is provided - --dict [file] Fasta dictionary file - If none provided, will be generated automatically from the fasta reference - --fasta [file] Fasta reference - --fasta_fai [file] Fasta reference index - If none provided, will be generated automatically from the fasta reference - --germline_resource [file] Germline Resource File for GATK Mutect2 - --germline_resource_index Germline Resource Index for GATK Mutect2 - [file] if none provided, will be generated automatically if a germlineResource file is provided - --intervals [file] Intervals - If none provided, will be generated automatically from the fasta reference - Use --no_intervals to disable automatic generation - --known_indels [file] Known indels file - --known_indels_index [file] Known indels index - If none provided, will be generated automatically if a knownIndels file is provided - --mappability [file] Mappability file for Control-FREEC - --snpeff_db [str] snpEff Database version - --species [str] Species for VEP - --vep_cache_version [int] VEP cache version - - Other options: - --outdir [file] Output directory where the results will be saved - --publish_dir_mode [list] Specify mode of publishing data in the output directory (only one) - Available: symlink, rellink, link, copy, copyNoFollow, move - Default: ${params.publish_dir_mode} - --sequencing_center [str] Name of sequencing center to be displayed in BAM file - --multiqc_config [file] Specify a custom config file for MultiQC - --monochrome_logs [bool] Logs will be without colors - --email [str] Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits - --email_on_fail [str] Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you if the workflow fails - --plaintext_email [bool] Enable plaintext email - --max_multiqc_email_size [str] Theshold size for MultiQC report to be attached in notification email - If file generated by pipeline exceeds the threshold, it will not be attached - Default: ${params.max_multiqc_email_size} - -name [str] Name for the pipeline run - If not specified, Nextflow will automatically generate a random mnemonic - - AWSBatch options: - --awsqueue [str] The AWSBatch JobQueue that needs to be set when running on AWSBatch - --awsregion [str] The AWS Region for your AWSBatch job to run on - --awscli [str] Path to the AWS CLI tool - """.stripIndent() -} - -// Show help message -if (params.help) exit 0, helpMessage() +nextflow.preview.dsl = 2 /* -================================================================================ - HANDLE OLD PARAMS -================================================================================ -*/ - -// Warnings for deprecated params - -params.annotateTools = null -if (params.annotateTools) { - log.warn "The params `--annotateTools` is deprecated -- it will be removed in a future release." 
- log.warn "\tPlease check: https://nf-co.re/sarek/docs/usage.md#--annotate_tools" - params.annotate_tools = params.annotateTools -} - -params.annotateVCF = null -if (params.annotateVCF) { - log.warn "The params `--annotateVCF` is deprecated -- it will be removed in a future release." - log.warn "\tPlease check: https://nf-co.re/sarek/docs/usage.md#--input" - input = params.annotateVCF -} - -params.cadd_InDels = null -if (params.cadd_InDels) { - log.warn "The params `--cadd_InDels is deprecated -- it will be removed in a future release." - log.warn "\tPlease check: https://nf-co.re/sarek/docs/usage.md#--cadd_indels" - params.cadd_indels = params.cadd_InDels -} - -params.cadd_InDels_tbi = null -if (params.cadd_InDels_tbi) { - log.warn "The params `--cadd_InDels_tbi is deprecated -- it will be removed in a future release." - log.warn "\tPlease check: https://nf-co.re/sarek/docs/usage.md#--cadd_indels_tbi" - params.cadd_indels_tbi = params.cadd_InDels_tbi -} - -params.cadd_WG_SNVs = null -if (params.cadd_WG_SNVs) { - log.warn "The params `--cadd_WG_SNVs is deprecated -- it will be removed in a future release." - log.warn "\tPlease check: https://nf-co.re/sarek/docs/usage.md#--cadd_wg_snvs" - params.cadd_wg_snvs = params.cadd_WG_SNVs -} - -params.cadd_WG_SNVs_tbi = null -if (params.cadd_WG_SNVs_tbi) { - log.warn "The params `--cadd_WG_SNVs_tbi is deprecated -- it will be removed in a future release." - log.warn "\tPlease check: https://nf-co.re/sarek/docs/usage.md#--cadd_wg_snvs_tbi" - params.cadd_wg_snvs_tbi = params.cadd_WG_SNVs_tbi -} - -params.maxMultiqcEmailFileSize = null -if (params.maxMultiqcEmailFileSize) { - log.warn "The params `--maxMultiqcEmailFileSize` is deprecated -- it will be removed in a future release." - log.warn "\tPlease check: https://nf-co.re/sarek/docs/usage.md#--max_multiqc_email_size" - params.max_multiqc_email_size = params.maxMultiqcEmailFileSize -} - -params.noGVCF = null -if (params.noGVCF) { - log.warn "The params `--noGVCF` is deprecated -- it will be removed in a future release." - log.warn "\tPlease check: https://nf-co.re/sarek/docs/usage.md#--no_gvcf" - params.no_gvcf = params.noGVCF -} - -params.noReports = null -if (params.noReports) { - log.warn "The params `--noReports` is deprecated -- it will be removed in a future release." - log.warn "\tPlease check: https://nf-co.re/sarek/docs/usage.md#--skip_qc" - params.skip_qc = 'all' -} - -params.noStrelkaBP = null -if (params.noStrelkaBP) { - log.warn "The params `--noStrelkaBP` is deprecated -- it will be removed in a future release." - log.warn "\tPlease check: https://nf-co.re/sarek/docs/usage.md#--no_strelka_bp" - params.no_strelka_bp = params.noStrelkaBP + * Print help message if required + */ +if (params.help) { + def command = "nextflow run nf-core/sarek --input sample.tsv -profile docker" + log.info Headers.nf_core(workflow, params.monochrome_logs) + log.info Schema.params_help("$baseDir/nextflow_schema.json", command) + exit 0 } -params.nucleotidesPerSecond = null -if (params.nucleotidesPerSecond) { - log.warn "The params `--nucleotidesPerSecond` is deprecated -- it will be removed in a future release." - log.warn "\tPlease check: https://nf-co.re/sarek/docs/usage.md#--nucleotides_per_second" - params.nucleotides_per_second = params.nucleotidesPerSecond -} - -params.publishDirMode = null -if (params.publishDirMode) { - log.warn "The params `--publishDirMode` is deprecated -- it will be removed in a future release." 
- log.warn "\tPlease check: https://nf-co.re/sarek/docs/usage.md#--publish_dir_mode" - params.publish_dir_mode = params.publishDirMode -} - -params.sample = null -if (params.sample) { - log.warn "The params `--sample` is deprecated -- it will be removed in a future release." - log.warn "\tPlease check: https://nf-co.re/sarek/docs/usage.md#--input" - params.input = params.sample -} - -params.sampleDir = null -if (params.sampleDir) { - log.warn "The params `--sampleDir` is deprecated -- it will be removed in a future release." - log.warn "\tPlease check: https://nf-co.re/sarek/docs/usage.md#--input" - params.input = params.sampleDir -} - -params.saveGenomeIndex = null -if (params.saveGenomeIndex) { - log.warn "The params `--saveGenomeIndex` is deprecated -- it will be removed in a future release." - log.warn "\tPlease check: https://nf-co.re/sarek/docs/usage.md#--save_reference" - params.save_reference = params.saveGenomeIndex -} - -params.skipQC = null -if (params.skipQC) { - log.warn "The params `--skipQC` is deprecated -- it will be removed in a future release." - log.warn "\tPlease check: https://nf-co.re/sarek/docs/usage.md#--skip_qc" - params.skip_qc = params.skipQC -} - -params.snpEff_cache = null -if (params.snpEff_cache) { - log.warn "The params `--snpEff_cache` is deprecated -- it will be removed in a future release." - log.warn "\tPlease check: https://nf-co.re/sarek/docs/usage.md#--snpeff_cache" - params.snpeff_cache = params.snpEff_cache -} - -params.targetBed = null -if (params.targetBed) { - log.warn "The params `--targetBed` is deprecated -- it will be removed in a future release." - log.warn "\tPlease check: https://nf-co.re/sarek/docs/usage.md#--target_bed" - params.target_bed = params.targetBed -} - -// Errors for removed params - -params.acLoci = null -if (params.acLoci) exit 1, "The params `--acLoci` has been removed.\n\tPlease check: https://nf-co.re/sarek/docs/usage.md#--ac_loci" - -params.acLociGC = null -if (params.acLociGC) exit 1, "The params `--acLociGC` has been removed.\n\tPlease check: https://nf-co.re/sarek/docs/usage.md#--ac_loci_gc" - -params.bwaIndex = null -if (params.bwaIndex) exit 1, "The params `--bwaIndex` has been removed.\n\tPlease check: https://nf-co.re/sarek/docs/usage.md#--bwa" - -params.chrDir = null -if (params.chrDir) exit 1, "The params `--chrDir` has been removed.\n\tPlease check: https://nf-co.re/sarek/docs/usage.md#--chr_dir" - -params.chrLength = null -if (params.chrLength) exit 1, "The params `--chrLength` has been removed.\n\tPlease check: https://nf-co.re/sarek/docs/usage.md#--chr_length" - -params.dbsnpIndex = null -if (params.dbsnpIndex) exit 1, "The params `--dbsnpIndex` has been removed.\n\tPlease check: https://nf-co.re/sarek/docs/usage.md#--dbsnp_index" - -params.fastaFai = null -if (params.fastaFai) exit 1, "The params `--fastaFai` has been removed.\n\tPlease check: https://nf-co.re/sarek/docs/usage.md#--fasta_fai" - -params.genomeDict = null -if (params.genomeDict) exit 1, "The params `--genomeDict` has been removed.\n\tPlease check: https://nf-co.re/sarek/docs/usage.md#--dict" - -params.genomeFile = null -if (params.genomeFile) exit 1, "The params `--genomeFile` has been removed.\n\tPlease check: https://nf-co.re/sarek/docs/usage.md#--fasta" - -params.genomeIndex = null -if (params.genomeIndex) exit 1, "The params `--genomeIndex` has been removed.\n\tPlease check: https://nf-co.re/sarek/docs/usage.md#--fasta_fai" - -params.germlineResource = null -if (params.germlineResource) exit 1, "The params `--germlineResource` has been 
removed.\n\tPlease check: https://nf-co.re/sarek/docs/usage.md#--germline_resource" - -params.germlineResourceIndex = null -if (params.germlineResourceIndex) exit 1, "The params `--germlineResourceIndex` has been removed.\n\tPlease check: https://nf-co.re/sarek/docs/usage.md#--germline_resource_index" - -params.igenomesIgnore = null -if (params.igenomesIgnore) exit 1, "The params `--igenomesIgnore` has been removed.\n\tPlease check: https://nf-co.re/sarek/docs/usage.md#--igenomes_ignore" - -params.knownIndels = null -if (params.knownIndels) exit 1, "The params `--knownIndels` has been removed.\n\tPlease check: https://nf-co.re/sarek/docs/usage.md#--known_indels" - -params.knownIndelsIndex = null -if (params.knownIndelsIndex) exit 1, "The params `--knownIndelsIndex` has been removed.\n\tPlease check: https://nf-co.re/sarek/docs/usage.md#--known_indels_index" - -params.snpeffDb = null -if (params.snpeffDb) exit 1, "The params `--snpeffDb` has been removed.\n\tPlease check: https://nf-co.re/sarek/docs/usage.md#--snpeff_db" - -params.singleCPUMem = null -if (params.singleCPUMem) exit 1, "The params `--singleCPUMem` has been removed.\n\tPlease check: https://nf-co.re/sarek/docs/usage.md#--single_cpu_mem" - -params.vepCacheVersion = null -if (params.vepCacheVersion) exit 1, "The params `--vepCacheVersion` has been removed.\n\tPlease check: https://nf-co.re/sarek/docs/usage.md#--vep_cache_version" /* ================================================================================ @@ -375,6 +38,20 @@ if (params.vepCacheVersion) exit 1, "The params `--vepCacheVersion` has been rem ================================================================================ */ +/* + * Check parameters + */ +Checks.aws_batch(workflow, params) // Check AWS batch settings +Checks.hostname(workflow, params, log) // Check the hostnames against configured profiles + +/* + * MultiQC + * Stage config files + */ +ch_multiqc_config = file("$baseDir/assets/multiqc_config.yaml", checkIfExists: true) +ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty() +ch_output_docs = file("$baseDir/docs/output.md", checkIfExists: true) + // Check if genome exists in the config file if (params.genomes && !params.genomes.containsKey(params.genome) && !params.igenomes_ignore) { exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}" @@ -408,26 +85,6 @@ if (!checkParameterList(annotateTools,annoList)) exit 1, 'Unknown tool(s) to ann if ((params.ascat_ploidy && !params.ascat_purity) || (!params.ascat_ploidy && params.ascat_purity)) exit 1, 'Please specify both --ascat_purity and --ascat_ploidy, or none of them' if (params.cf_window && params.cf_coeff) exit 1, 'Please specify either --cf_window OR --cf_coeff, but not both of them' -// Has the run name been specified by the user? -// This has the bonus effect of catching both -name and --name -custom_runName = params.name -if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) custom_runName = workflow.runName - -if (workflow.profile.contains('awsbatch')) { - // AWSBatch sanity checking - if (!params.awsqueue || !params.awsregion) exit 1, "Specify correct --awsqueue and --awsregion parameters on AWSBatch!" 
- // Check outdir paths to be S3 buckets if running on AWSBatch - // related: https://github.com/nextflow-io/nextflow/issues/813 - if (!params.outdir.startsWith('s3:')) exit 1, "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!" - // Prevent trace files to be stored on S3 since S3 does not support rolling files. - if (params.tracedir.startsWith('s3:')) exit 1, "Specify a local tracedir or run without trace! S3 cannot be used for tracefiles." -} - -// MultiQC -// Stage config files -ch_multiqc_config = file("$baseDir/assets/multiqc_config.yaml", checkIfExists: true) -ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty() -ch_output_docs = file("$baseDir/docs/output.md", checkIfExists: true) // Handle input tsvPath = null @@ -560,160 +217,34 @@ ch_target_bed = params.target_bed ? Channel.value(file(params.target_bed)) : "nu ================================================================================ */ -// Header log info -log.info nfcoreHeader() -def summary = [:] -if (workflow.revision) summary['Pipeline Release'] = workflow.revision -summary['Run Name'] = custom_runName ?: workflow.runName -summary['Max Resources'] = "${params.max_memory} memory, ${params.max_cpus} cpus, ${params.max_time} time per job" -if (workflow.containerEngine) summary['Container'] = "${workflow.containerEngine} - ${workflow.container}" - -summary['Input'] = params.input -summary['Step'] = step -summary['Genome'] = params.genome - -if (params.no_intervals && step != 'annotate') summary['Intervals'] = 'Do not use' -summary['Nucleotides/s'] = params.nucleotides_per_second -if (params.sentieon) summary['Sention'] = "Using Sentieon for Preprocessing and/or Variant Calling" -if (params.skip_qc) summary['QC tools skipped'] = skipQC.join(', ') -if (params.target_bed) summary['Target BED'] = params.target_bed -if (params.tools) summary['Tools'] = tools.join(', ') - -if (params.trim_fastq || params.split_fastq) summary['Modify fastqs (trim/split)'] = "" - -if (params.trim_fastq) { - summary['Fastq trim'] = "Fastq trim selected" - summary['Trim R1'] = "${params.clip_r1} bp" - summary['Trim R2'] = "${params.clip_r2} bp" - summary["Trim 3' R1"] = "${params.three_prime_clip_r1} bp" - summary["Trim 3' R2"] = "${params.three_prime_clip_r2} bp" - summary['NextSeq Trim'] = "${params.trim_nextseq} bp" - summary['Saved Trimmed Fastq'] = params.save_trimmed ? 'Yes' : 'No' -} -if (params.split_fastq) summary['Reads in fastq'] = params.split_fastq - -summary['MarkDuplicates'] = "Options" -summary['Java options'] = params.markdup_java_options -summary['GATK Spark'] = params.no_gatk_spark ? 'No' : 'Yes' - -summary['Save BAMs mapped'] = params.save_bam_mapped ? 'Yes' : 'No' -summary['Skip MarkDuplicates'] = params.skip_markduplicates ? 'Yes' : 'No' - -if ('ascat' in tools) { - summary['ASCAT'] = "Options" - if (params.ascat_purity) summary['purity'] = params.ascat_purity - if (params.ascat_ploidy) summary['ploidy'] = params.ascat_ploidy -} - -if ('controlfreec' in tools) { - summary['Control-FREEC'] = "Options" - if (params.cf_window) summary['window'] = params.cf_window - if (params.cf_coeff) summary['coefficientOfVariation'] = params.cf_coeff - if (params.cf_ploidy) summary['ploidy'] = params.cf_ploidy -} - -if ('haplotypecaller' in tools) summary['GVCF'] = params.no_gvcf ? 'No' : 'Yes' -if ('strelka' in tools && 'manta' in tools) summary['Strelka BP'] = params.no_strelka_bp ? 
'No' : 'Yes' -if (params.pon && ('mutect2' in tools || (params.sentieon && 'tnscope' in tools))) summary['Panel of normals'] = params.pon - -if (params.annotate_tools) summary['Tools to annotate'] = annotate_tools.join(', ') - -if (params.annotation_cache) { - summary['Annotation cache'] = "Enabled" - if (params.snpeff_cache) summary['snpEff cache'] = params.snpeff_cache - if (params.vep_cache) summary['VEP cache'] = params.vep_cache -} - -if (params.cadd_cache) { - summary['CADD cache'] = "Enabled" - if (params.cadd_indels) summary['CADD indels'] = params.cadd_indels - if (params.cadd_wg_snvs) summary['CADD wg snvs'] = params.cadd_wg_snvs -} - -if (params.genesplicer) summary['genesplicer'] = "Enabled" - -if (params.igenomes_base && !params.igenomes_ignore) summary['AWS iGenomes base'] = params.igenomes_base -if (params.igenomes_ignore) summary['AWS iGenomes'] = "Do not use" -if (params.genomes_base && !params.igenomes_ignore) summary['Genomes base'] = params.genomes_base - -summary['Save Reference'] = params.save_reference ? 'Yes' : 'No' - -if (params.ac_loci) summary['Loci'] = params.ac_loci -if (params.ac_loci_gc) summary['Loci GC'] = params.ac_loci_gc -if (params.bwa) summary['BWA indexes'] = params.bwa -if (params.chr_dir) summary['Chromosomes'] = params.chr_dir -if (params.chr_length) summary['Chromosomes length'] = params.chr_length -if (params.dbsnp) summary['dbsnp'] = params.dbsnp -if (params.dbsnp_index) summary['dbsnpIndex'] = params.dbsnp_index -if (params.dict) summary['dict'] = params.dict -if (params.fasta) summary['fasta reference'] = params.fasta -if (params.fasta_fai) summary['fasta index'] = params.fasta_fai -if (params.germline_resource) summary['germline resource'] = params.germline_resource -if (params.germline_resource_index) summary['germline resource index'] = params.germline_resource_index -if (params.intervals) summary['intervals'] = params.intervals -if (params.known_indels) summary['known indels'] = params.known_indels -if (params.known_indels_index) summary['known indels index'] = params.known_indels_index -if (params.mappability) summary['Mappability'] = params.mappability -if (params.snpeff_cache) summary['snpEff cache'] = params.snpeff_cache -if (params.snpeff_db) summary['snpEff DB'] = params.snpeff_db -if (params.species) summary['species'] = params.species -if (params.vep_cache) summary['VEP cache'] = params.vep_cache -if (params.vep_cache_version) summary['VEP cache version'] = params.vep_cache_version - -summary['Output dir'] = params.outdir -summary['Publish dir mode'] = params.publish_dir_mode -if (params.sequencing_center) summary['Sequenced by'] = params.sequencing_center - -summary['Launch dir'] = workflow.launchDir -summary['Working dir'] = workflow.workDir -summary['Script dir'] = workflow.projectDir -summary['User'] = workflow.userName - -if (params.multiqc_config) summary['MultiQC config'] = params.multiqc_config - -summary['Config Profile'] = workflow.profile - -if (params.config_profile_description) summary['Config Description'] = params.config_profile_description -if (params.config_profile_contact) summary['Config Contact'] = params.config_profile_contact -if (params.config_profile_url) summary['Config URL'] = params.config_profile_url - -if (params.email || params.email_on_fail) { - summary['E-mail Address'] = params.email - summary['E-mail on failure'] = params.email_on_fail - summary['MultiQC maxsize'] = params.max_multiqc_email_size -} - -if (workflow.profile.contains('awsbatch')) { - summary['AWS Region'] = params.awsregion - 
summary['AWS Queue'] = params.awsqueue - summary['AWS CLI'] = params.awscli +// Has the run name been specified by the user? +// This has the bonus effect of catching both -name and --name +run_name = params.name +if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) { + run_name = workflow.runName +} +summary = Schema.params_summary(workflow, params, run_name) +log.info Headers.nf_core(workflow, params.monochrome_logs) +log.info summary.collect { k,v -> "${k.padRight(20)}: $v" }.join("\n") +log.info "-\033[2m----------------------------------------------------\033[0m-" -log.info summary.collect { k, v -> "${k.padRight(18)}: $v" }.join("\n") -if (params.monochrome_logs) log.info "----------------------------------------------------" -else log.info "-\033[2m--------------------------------------------------\033[0m-" +// params summary for MultiQC +workflow_summary = Schema.params_mqc_summary(summary) +ch_workflow_summary = Channel.value(workflow_summary) if ('mutect2' in tools && !(params.pon)) log.warn "[nf-core/sarek] Mutect2 was requested, but as no panel of normals were given, results will not be optimal" if (params.sentieon) log.warn "[nf-core/sarek] Sentieon will be used, only works if Sentieon is available where nf-core/sarek is run"
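The run-name regex deserves a gloss: Nextflow's auto-generated names are two lowercase words joined by an underscore, so any name that does not match must have come from `-name`. A quick illustration:

```groovy
// Auto-generated names like 'focused_einstein' match, so params.name is kept;
// a user-supplied name such as 'my-run-42' does not, so workflow.runName wins.
assert 'focused_einstein' ==~ /[a-z]+_[a-z]+/
assert !('my-run-42' ==~ /[a-z]+_[a-z]+/)
```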
-// Check the hostnames against configured profiles -checkHostname() -Channel.from(summary.collect{ [it.key, it.value] }) - .map { k,v -> "<dt>$k</dt><dd><samp>${v ?: '<span style=\"color:#999999;\">N/A</a>'}</samp></dd>" } - .reduce { a, b -> return [a, b].join("\n ") } - .map { x -> """ - id: 'sarek-summary' - description: " - this information is collected when the pipeline is started." - section_name: 'nf-core/sarek Workflow Summary' - section_href: 'https://github.com/nf-core/sarek' - plot_type: 'html' - data: | -
- <dl class=\"dl-horizontal\"> - $x - </dl> - """.stripIndent() } - .set { ch_workflow_summary }
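The includes that follow use the DSL2 preview syntax, where `params(params)` forwards the pipeline's parameter map into each module's scope and an included process is then invoked like a function. A schematic of the idiom (the input wiring is hypothetical, not this patch's eventual workflow block):

```groovy
// DSL2 preview idiom: include a process, forward params, call it in a workflow
nextflow.preview.dsl = 2
include { FASTQC } from './modules/nf-core/fastqc' params(params)

workflow {
    // hypothetical wiring: adapt fromFilePairs tuples to FASTQC's
    // (name, single_end, reads) input shape
    reads = Channel.fromFilePairs(params.input)
    FASTQC(reads.map { name, files -> [ name, false, files ] })
}
```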
+/* +================================================================================ + INCLUDE LOCAL PIPELINE MODULES +================================================================================ +*/ + +include { OUTPUT_DOCUMENTATION } from './modules/local/output_documentation' params(params) +include { GET_SOFTWARE_VERSIONS } from './modules/local/get_software_versions' params(params) +include { CHECK_SAMPLESHEET; check_samplesheet_paths } from './modules/local/check_samplesheet' params(params) // Parse software version numbers diff --git a/modules/local/check_samplesheet.nf b/modules/local/check_samplesheet.nf new file mode 100644 index 0000000000..42de176519 --- /dev/null +++ b/modules/local/check_samplesheet.nf @@ -0,0 +1,34 @@ +/* + * Reformat input samplesheet and check validity + */ +process CHECK_SAMPLESHEET { + tag "$samplesheet" + publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode + + input: + path samplesheet + + output: + path '*.csv' + + script: // This script is bundled with the pipeline, in nf-core/tcrseq/bin/ + """ + check_samplesheet.py $samplesheet samplesheet.valid.csv + """ +} + +// Function to get list of [ sample, single_end?, [ fastq_1, fastq_2 ] ] +def check_samplesheet_paths(LinkedHashMap samplesheet) { + def sample = samplesheet.sample + def single_end = samplesheet.single_end.toBoolean() + def fastq_1 = samplesheet.fastq_1 + def fastq_2 = samplesheet.fastq_2 + + def array = [] + if (single_end) { + array = [ sample, single_end, [ file(fastq_1, checkIfExists: true) ] ] + } else { + array = [ sample, single_end, [ file(fastq_1, checkIfExists: true), file(fastq_2, checkIfExists: true) ] ] + } + return array +} diff --git a/modules/local/get_software_versions.nf b/modules/local/get_software_versions.nf new file mode 100644 index 0000000000..289234b0b2 --- /dev/null +++ b/modules/local/get_software_versions.nf @@ -0,0 +1,24 @@ +/* + * Parse software version numbers + */ +process GET_SOFTWARE_VERSIONS { + publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.indexOf(".csv") > 0) filename + else null + } + + output: + path 'software_versions_mqc.yaml', emit: software_versions_yml + path "software_versions.csv", emit: software_versions_csv + + script: + // TODO nf-core: Get all tools to print their version number here + """ + echo $workflow.manifest.version > v_pipeline.txt + echo $workflow.nextflow.version > v_nextflow.txt + fastqc --version > v_fastqc.txt + multiqc --version > v_multiqc.txt + scrape_software_versions.py &> software_versions_mqc.yaml + """ +} diff --git a/modules/local/output_documentation.nf b/modules/local/output_documentation.nf new file mode 100644 index 0000000000..b6d24b0f58 --- /dev/null +++ b/modules/local/output_documentation.nf @@ -0,0 +1,18 @@ +/* + * Output Markdown documentation to HTML + */ +process OUTPUT_DOCUMENTATION { + publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode + + input: + path output_docs + path images + + output: + path "results_description.html" + + script: + """ + markdown_to_html.py $output_docs -o results_description.html + """ +} diff --git a/modules/nf-core/fastqc.nf b/modules/nf-core/fastqc.nf new file mode 100644 index 0000000000..5129ab4884 --- /dev/null +++ b/modules/nf-core/fastqc.nf @@ -0,0 +1,32 @@ +/* + * FastQC + */ +process FASTQC { + tag "$name" + label 'process_medium' + publishDir "${params.outdir}/fastqc", mode: params.publish_dir_mode, + 
saveAs: { filename -> + filename.indexOf(".zip") > 0 ? "zips/$filename" : "$filename" + } + + input: + tuple val(name), val(single_end), path(reads) + + output: + path "*.{zip,html}" + + script: + // Add soft-links to original FastQs for consistent naming in pipeline + if (single_end) { + """ + [ ! -f ${name}.fastq.gz ] && ln -s $reads ${name}.fastq.gz + fastqc --quiet --threads $task.cpus ${name}.fastq.gz + """ + } else { + """ + [ ! -f ${name}_1.fastq.gz ] && ln -s ${reads[0]} ${name}_1.fastq.gz + [ ! -f ${name}_2.fastq.gz ] && ln -s ${reads[1]} ${name}_2.fastq.gz + fastqc --quiet --threads $task.cpus ${name}_1.fastq.gz ${name}_2.fastq.gz + """ + } +} diff --git a/modules/nf-core/multiqc.nf b/modules/nf-core/multiqc.nf new file mode 100644 index 0000000000..6cec640723 --- /dev/null +++ b/modules/nf-core/multiqc.nf @@ -0,0 +1,36 @@ +// Has the run name been specified by the user? +// this has the bonus effect of catching both -name and --name +custom_runName = params.name +if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) { + custom_runName = workflow.runName +} + +/* + * MultiQC + */ +process MULTIQC { + publishDir "${params.outdir}/multiqc", mode: params.publish_dir_mode + + input: + path multiqc_config + path mqc_custom_config + // TODO nf-core: Add in log files from your new processes for MultiQC to find! + path fastqc + path software_versions + val workflow_summary + + output: + path "*multiqc_report.html" + path "*_data" + path "multiqc_plots" + + script: + rtitle = custom_runName ? "--title \"$custom_runName\"" : '' + rfilename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : '' + custom_config_file = params.multiqc_config ? "--config $mqc_custom_config" : '' + // TODO nf-core: Specify which MultiQC modules to use with -m for a faster run time + """ + echo '$workflow_summary' > workflow_summary_mqc.yaml + multiqc -f $rtitle $rfilename $custom_config_file . + """ +} From c721801cdd0b0ff6b9823e9e8ae63dda630f01be Mon Sep 17 00:00:00 2001 From: ggabernet Date: Tue, 14 Jul 2020 11:00:19 +0200 Subject: [PATCH 002/200] first workflow draft --- main.nf | 6789 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 3412 insertions(+), 3377 deletions(-) diff --git a/main.nf b/main.nf index 7a499dfeae..d1cc24b18c 100644 --- a/main.nf +++ b/main.nf @@ -241,3579 +241,3614 @@ if (params.sentieon) log.warn "[nf-core/sarek] Sentieon will be used, only works INCLUDE LOCAL PIPELINE MODULES ================================================================================ */ - include { OUTPUT_DOCUMENTATION } from './modules/local/output_documentation' params(params) include { GET_SOFTWARE_VERSIONS } from './modules/local/get_software_versions' params(params) include { CHECK_SAMPLESHEET; check_samplesheet_paths } from './modules/local/check_samplesheet' params(params) -// Parse software version numbers - -process Get_software_versions { - publishDir path:"${params.outdir}/pipeline_info", mode: params.publish_dir_mode, - saveAs: {it.indexOf(".csv") > 0 ? 
it : null} - - output: - file 'software_versions_mqc.yaml' into ch_software_versions_yaml - file "software_versions.csv" - - when: !('versions' in skipQC) - - script: - """ - alleleCounter --version &> v_allelecount.txt 2>&1 || true - bcftools --version &> v_bcftools.txt 2>&1 || true - bwa &> v_bwa.txt 2>&1 || true - cnvkit.py version &> v_cnvkit.txt 2>&1 || true - configManta.py --version &> v_manta.txt 2>&1 || true - configureStrelkaGermlineWorkflow.py --version &> v_strelka.txt 2>&1 || true - echo "${workflow.manifest.version}" &> v_pipeline.txt 2>&1 || true - echo "${workflow.nextflow.version}" &> v_nextflow.txt 2>&1 || true - snpEff -version &> v_snpeff.txt 2>&1 || true - fastqc --version &> v_fastqc.txt 2>&1 || true - freebayes --version &> v_freebayes.txt 2>&1 || true - freec &> v_controlfreec.txt 2>&1 || true - gatk ApplyBQSR --help &> v_gatk.txt 2>&1 || true - msisensor &> v_msisensor.txt 2>&1 || true - multiqc --version &> v_multiqc.txt 2>&1 || true - qualimap --version &> v_qualimap.txt 2>&1 || true - R --version &> v_r.txt 2>&1 || true - R -e "library(ASCAT); help(package='ASCAT')" &> v_ascat.txt 2>&1 || true - samtools --version &> v_samtools.txt 2>&1 || true - tiddit &> v_tiddit.txt 2>&1 || true - trim_galore -v &> v_trim_galore.txt 2>&1 || true - vcftools --version &> v_vcftools.txt 2>&1 || true - vep --help &> v_vep.txt 2>&1 || true - - scrape_software_versions.py &> software_versions_mqc.yaml - """ -} - -ch_software_versions_yaml = ch_software_versions_yaml.dump(tag:'SOFTWARE VERSIONS') - /* ================================================================================ - BUILDING INDEXES + INCLUDE nf-core PIPELINE MODULES ================================================================================ */ +include { FASTQC } from './modules/nf-core/fastqc' params(params) +include { MULTIQC } from './modules/nf-core/multiqc' params(params) -// And then initialize channels based on params or indexes that were just built - -process BuildBWAindexes { - tag "${fasta}" - - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: {params.save_reference ? "reference_genome/BWAIndex/${it}" : null } - - input: - file(fasta) from ch_fasta - - output: - file("${fasta}.*") into bwa_built - - when: !(params.bwa) && params.fasta && 'mapping' in step - - script: - """ - bwa index ${fasta} - """ -} - -ch_bwa = params.bwa ? Channel.value(file(params.bwa)) : bwa_built - -process BuildDict { - tag "${fasta}" - - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: {params.save_reference ? "reference_genome/${it}" : null } - - input: - file(fasta) from ch_fasta - - output: - file("${fasta.baseName}.dict") into dictBuilt - - when: !(params.dict) && params.fasta && !('annotate' in step) && !('controlfreec' in step) - - script: - """ - gatk --java-options "-Xmx${task.memory.toGiga()}g" \ - CreateSequenceDictionary \ - --REFERENCE ${fasta} \ - --OUTPUT ${fasta.baseName}.dict - """ -} - -ch_dict = params.dict ? Channel.value(file(params.dict)) : dictBuilt - -process BuildFastaFai { - tag "${fasta}" - - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: {params.save_reference ? "reference_genome/${it}" : null } - - input: - file(fasta) from ch_fasta - - output: - file("${fasta}.fai") into fai_built - - when: !(params.fasta_fai) && params.fasta && !('annotate' in step) - - script: - """ - samtools faidx ${fasta} - """ -} - -ch_fai = params.fasta_fai ? 
Channel.value(file(params.fasta_fai)) : fai_built - -process BuildDbsnpIndex { - tag "${dbsnp}" - - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: {params.save_reference ? "reference_genome/${it}" : null } - - input: - file(dbsnp) from ch_dbsnp - - output: - file("${dbsnp}.tbi") into dbsnp_tbi - - when: !(params.dbsnp_index) && params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || 'tnscope' in tools) - - script: - """ - tabix -p vcf ${dbsnp} - """ -} - -ch_dbsnp_tbi = params.dbsnp ? params.dbsnp_index ? Channel.value(file(params.dbsnp_index)) : dbsnp_tbi : "null" - -process BuildGermlineResourceIndex { - tag "${germlineResource}" - - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: {params.save_reference ? "reference_genome/${it}" : null } - - input: - file(germlineResource) from ch_germline_resource - - output: - file("${germlineResource}.tbi") into germline_resource_tbi - - when: !(params.germline_resource_index) && params.germline_resource && 'mutect2' in tools - - script: - """ - tabix -p vcf ${germlineResource} - """ -} - -ch_germline_resource_tbi = params.germline_resource ? params.germline_resource_index ? Channel.value(file(params.germline_resource_index)) : germline_resource_tbi : "null" - -process BuildKnownIndelsIndex { - tag "${knownIndels}" - - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: {params.save_reference ? "reference_genome/${it}" : null } - - input: - each file(knownIndels) from ch_known_indels - - output: - file("${knownIndels}.tbi") into known_indels_tbi - - when: !(params.known_indels_index) && params.known_indels && ('mapping' in step || 'preparerecalibration' in step) - - script: - """ - tabix -p vcf ${knownIndels} - """ -} - -ch_known_indels_tbi = params.known_indels ? params.known_indels_index ? Channel.value(file(params.known_indels_index)) : known_indels_tbi.collect() : "null" - -process BuildPonIndex { - tag "${pon}" - - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: {params.save_reference ? "reference_genome/${it}" : null } - - input: - file(pon) from ch_pon - - output: - file("${pon}.tbi") into pon_tbi - - when: !(params.pon_index) && params.pon && ('tnscope' in tools || 'mutect2' in tools) - - script: - """ - tabix -p vcf ${pon} - """ -} - -ch_pon_tbi = params.pon ? params.pon_index ? Channel.value(file(params.pon_index)) : pon_tbi : "null" - -process BuildIntervals { - tag "${fastaFai}" - - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: {params.save_reference ? "reference_genome/${it}" : null } - - input: - file(fastaFai) from ch_fai - - output: - file("${fastaFai.baseName}.bed") into intervalBuilt - - when: !(params.intervals) && !('annotate' in step) && !('controlfreec' in step) - - script: - """ - awk -v FS='\t' -v OFS='\t' '{ print \$1, \"0\", \$2 }' ${fastaFai} > ${fastaFai.baseName}.bed - """ -} - -ch_intervals = params.no_intervals ? "null" : params.intervals && !('annotate' in step) ? 
Channel.value(file(params.intervals)) : intervalBuilt /* ================================================================================ - PREPROCESSING + RUN THE WORKFLOW ================================================================================ */ -// STEP 0: CREATING INTERVALS FOR PARALLELIZATION (PREPROCESSING AND VARIANT CALLING) - -process CreateIntervalBeds { - tag "${intervals}" - - input: - file(intervals) from ch_intervals - - output: - file '*.bed' into bedIntervals mode flatten - - when: (!params.no_intervals) && step != 'annotate' - - script: - // If the interval file is BED format, the fifth column is interpreted to - // contain runtime estimates, which is then used to combine short-running jobs - if (hasExtension(intervals, "bed")) - """ - awk -vFS="\t" '{ - t = \$5 # runtime estimate - if (t == "") { - # no runtime estimate in this row, assume default value - t = (\$3 - \$2) / ${params.nucleotides_per_second} - } - if (name == "" || (chunk > 600 && (chunk + t) > longest * 1.05)) { - # start a new chunk - name = sprintf("%s_%d-%d.bed", \$1, \$2+1, \$3) - chunk = 0 - longest = 0 - } - if (t > longest) - longest = t - chunk += t - print \$0 > name - }' ${intervals} - """ - else if (hasExtension(intervals, "interval_list")) - """ - grep -v '^@' ${intervals} | awk -vFS="\t" '{ - name = sprintf("%s_%d-%d", \$1, \$2, \$3); - printf("%s\\t%d\\t%d\\n", \$1, \$2-1, \$3) > name ".bed" - }' - """ - else - """ - awk -vFS="[:-]" '{ - name = sprintf("%s_%d-%d", \$1, \$2, \$3); - printf("%s\\t%d\\t%d\\n", \$1, \$2-1, \$3) > name ".bed" - }' ${intervals} - """ -} - -bedIntervals = bedIntervals - .map { intervalFile -> - def duration = 0.0 - for (line in intervalFile.readLines()) { - final fields = line.split('\t') - if (fields.size() >= 5) duration += fields[4].toFloat() - else { - start = fields[1].toInteger() - end = fields[2].toInteger() - duration += (end - start) / params.nucleotides_per_second - } - } - [duration, intervalFile] - }.toSortedList({ a, b -> b[0] <=> a[0] }) - .flatten().collate(2) - .map{duration, intervalFile -> intervalFile} - -bedIntervals = bedIntervals.dump(tag:'bedintervals') - -if (params.no_intervals && step != 'annotate') { - file("${params.outdir}/no_intervals.bed").text = "no_intervals\n" - bedIntervals = Channel.from(file("${params.outdir}/no_intervals.bed")) -} - -(intBaseRecalibrator, intApplyBQSR, intHaplotypeCaller, intFreebayesSingle, intMpileup, bedIntervals) = bedIntervals.into(6) - -// PREPARING CHANNELS FOR PREPROCESSING AND QC - -inputBam = Channel.create() -inputPairReads = Channel.create() - -if (step in ['preparerecalibration', 'recalibrate', 'variantcalling', 'controlfreec', 'annotate']) { - inputBam.close() - inputPairReads.close() -} else inputSample.choice(inputPairReads, inputBam) {hasExtension(it[3], "bam") ? 
1 : 0} - -(inputBam, inputBamFastQC) = inputBam.into(2) - -// Removing inputFile2 which is null in case of uBAM -inputBamFastQC = inputBamFastQC.map { - idPatient, idSample, idRun, inputFile1, inputFile2 -> - [idPatient, idSample, idRun, inputFile1] -} - -if (params.split_fastq){ - inputPairReads = inputPairReads - // newly splitfastq are named based on split, so the name is easier to catch - .splitFastq(by: params.split_fastq, compress:true, file:"split", pe:true) - .map {idPatient, idSample, idRun, reads1, reads2 -> - // The split fastq read1 is the 4th element (indexed 3) its name is split_3 - // The split fastq read2's name is split_4 - // It's followed by which split it's acutally based on the mother fastq file - // Index start at 1 - // Extracting the index to get a new IdRun - splitIndex = reads1.fileName.toString().minus("split_3.").minus(".gz") - newIdRun = idRun + "_" + splitIndex - // Giving the files a new nice name - newReads1 = file("${idSample}_${newIdRun}_R1.fastq.gz") - newReads2 = file("${idSample}_${newIdRun}_R2.fastq.gz") - [idPatient, idSample, newIdRun, reads1, reads2]} -} - -inputPairReads = inputPairReads.dump(tag:'INPUT') - -(inputPairReads, inputPairReadsTrimGalore, inputPairReadsFastQC) = inputPairReads.into(3) - -// STEP 0.5: QC ON READS - -// TODO: Use only one process for FastQC for FASTQ files and uBAM files -// FASTQ and uBAM files are renamed based on the sample name - -process FastQCFQ { - label 'FastQC' - label 'cpus_2' - - tag "${idPatient}-${idRun}" - - publishDir "${params.outdir}/Reports/${idSample}/FastQC/${idSample}_${idRun}", mode: params.publish_dir_mode - - input: - set idPatient, idSample, idRun, file("${idSample}_${idRun}_R1.fastq.gz"), file("${idSample}_${idRun}_R2.fastq.gz") from inputPairReadsFastQC - - output: - file("*.{html,zip}") into fastQCFQReport - - when: !('fastqc' in skipQC) - - script: - """ - fastqc -t 2 -q ${idSample}_${idRun}_R1.fastq.gz ${idSample}_${idRun}_R2.fastq.gz - """ -} - -process FastQCBAM { - label 'FastQC' - label 'cpus_2' - - tag "${idPatient}-${idRun}" - - publishDir "${params.outdir}/Reports/${idSample}/FastQC/${idSample}_${idRun}", mode: params.publish_dir_mode - - input: - set idPatient, idSample, idRun, file("${idSample}_${idRun}.bam") from inputBamFastQC - - output: - file("*.{html,zip}") into fastQCBAMReport - - when: !('fastqc' in skipQC) - - script: - """ - fastqc -t 2 -q ${idSample}_${idRun}.bam - """ -} - -fastQCReport = fastQCFQReport.mix(fastQCBAMReport) - -fastQCReport = fastQCReport.dump(tag:'FastQC') - -process TrimGalore { - label 'TrimGalore' - - tag "${idPatient}-${idRun}" - - publishDir "${params.outdir}/Reports/${idSample}/TrimGalore/${idSample}_${idRun}", mode: params.publish_dir_mode, - saveAs: {filename -> - if (filename.indexOf("_fastqc") > 0) "FastQC/$filename" - else if (filename.indexOf("trimming_report.txt") > 0) "logs/$filename" - else if (params.save_trimmed) filename - else null - } - - input: - set idPatient, idSample, idRun, file("${idSample}_${idRun}_R1.fastq.gz"), file("${idSample}_${idRun}_R2.fastq.gz") from inputPairReadsTrimGalore - - output: - file("*.{html,zip,txt}") into trimGaloreReport - set idPatient, idSample, idRun, file("${idSample}_${idRun}_R1_val_1.fq.gz"), file("${idSample}_${idRun}_R2_val_2.fq.gz") into outputPairReadsTrimGalore - - when: params.trim_fastq - - script: - // Calculate number of --cores for TrimGalore based on value of task.cpus - // See: https://github.com/FelixKrueger/TrimGalore/blob/master/Changelog.md#version-060-release-on-1-mar-2019 - // See: 
https://github.com/nf-core/atacseq/pull/65 - def cores = 1 - if (task.cpus) { - cores = (task.cpus as int) - 4 - if (cores < 1) cores = 1 - if (cores > 4) cores = 4 - } - c_r1 = params.clip_r1 > 0 ? "--clip_r1 ${params.clip_r1}" : '' - c_r2 = params.clip_r2 > 0 ? "--clip_r2 ${params.clip_r2}" : '' - tpc_r1 = params.three_prime_clip_r1 > 0 ? "--three_prime_clip_r1 ${params.three_prime_clip_r1}" : '' - tpc_r2 = params.three_prime_clip_r2 > 0 ? "--three_prime_clip_r2 ${params.three_prime_clip_r2}" : '' - nextseq = params.trim_nextseq > 0 ? "--nextseq ${params.trim_nextseq}" : '' - """ - trim_galore \ - --cores ${cores} \ - --paired \ - --fastqc \ - --gzip \ - ${c_r1} ${c_r2} \ - ${tpc_r1} ${tpc_r2} \ - ${nextseq} \ - ${idSample}_${idRun}_R1.fastq.gz ${idSample}_${idRun}_R2.fastq.gz - - mv *val_1_fastqc.html "${idSample}_${idRun}_R1.trimmed_fastqc.html" - mv *val_2_fastqc.html "${idSample}_${idRun}_R2.trimmed_fastqc.html" - mv *val_1_fastqc.zip "${idSample}_${idRun}_R1.trimmed_fastqc.zip" - mv *val_2_fastqc.zip "${idSample}_${idRun}_R2.trimmed_fastqc.zip" - """ -} - -if (!params.trim_fastq) inputPairReadsTrimGalore.close() - -// STEP 1: MAPPING READS TO REFERENCE GENOME WITH BWA MEM - -if (params.trim_fastq) inputPairReads = outputPairReadsTrimGalore -else inputPairReads = inputPairReads.mix(inputBam) - -inputPairReads = inputPairReads.dump(tag:'INPUT') - -(inputPairReads, input_pair_reads_sentieon) = inputPairReads.into(2) -if (params.sentieon) inputPairReads.close() -else input_pair_reads_sentieon.close() - -process MapReads { - label 'cpus_max' - - tag "${idPatient}-${idRun}" - - input: - set idPatient, idSample, idRun, file(inputFile1), file(inputFile2) from inputPairReads - file(bwaIndex) from ch_bwa - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - - output: - set idPatient, idSample, idRun, file("${idSample}_${idRun}.bam") into bamMapped - set idPatient, val("${idSample}_${idRun}"), file("${idSample}_${idRun}.bam") into bamMappedBamQC - - when: !(params.sentieon) - - script: - // -K is an hidden option, used to fix the number of reads processed by bwa mem - // Chunk size can affect bwa results, if not specified, - // the number of threads can change which can give not deterministic result. - // cf https://github.com/CCDG/Pipeline-Standardization/blob/master/PipelineStandard.md - // and https://github.com/gatk-workflows/gatk4-data-processing/blob/8ffa26ff4580df4ac3a5aa9e272a4ff6bab44ba2/processing-for-variant-discovery-gatk4.b37.wgs.inputs.json#L29 - CN = params.sequencing_center ? "CN:${params.sequencing_center}\\t" : "" - readGroup = "@RG\\tID:${idRun}\\t${CN}PU:${idRun}\\tSM:${idSample}\\tLB:${idSample}\\tPL:illumina" - // adjust mismatch penalty for tumor samples - status = statusMap[idPatient, idSample] - extra = status == 1 ? "-B 3" : "" - convertToFastq = hasExtension(inputFile1, "bam") ? "gatk --java-options -Xmx${task.memory.toGiga()}g SamToFastq --INPUT=${inputFile1} --FASTQ=/dev/stdout --INTERLEAVE=true --NON_PF=true | \\" : "" - input = hasExtension(inputFile1, "bam") ? 
"-p /dev/stdin - 2> >(tee ${inputFile1}.bwa.stderr.log >&2)" : "${inputFile1} ${inputFile2}" - """ - ${convertToFastq} - bwa mem -K 100000000 -R \"${readGroup}\" ${extra} -t ${task.cpus} -M ${fasta} \ - ${input} | \ - samtools sort --threads ${task.cpus} -m 2G - > ${idSample}_${idRun}.bam - """ -} - -bamMapped = bamMapped.dump(tag:'Mapped BAM') -// Sort BAM whether they are standalone or should be merged - -singleBam = Channel.create() -multipleBam = Channel.create() -bamMapped.groupTuple(by:[0, 1]) - .choice(singleBam, multipleBam) {it[2].size() > 1 ? 1 : 0} -singleBam = singleBam.map { - idPatient, idSample, idRun, bam -> - [idPatient, idSample, bam] -} -singleBam = singleBam.dump(tag:'Single BAM') - -// STEP 1': MAPPING READS TO REFERENCE GENOME WITH SENTIEON BWA MEM - -process Sentieon_MapReads { - label 'cpus_max' - label 'memory_max' - label 'sentieon' - - tag "${idPatient}-${idRun}" - - input: - set idPatient, idSample, idRun, file(inputFile1), file(inputFile2) from input_pair_reads_sentieon - file(bwaIndex) from ch_bwa - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - - output: - set idPatient, idSample, idRun, file("${idSample}_${idRun}.bam") into bam_sentieon_mapped - - when: params.sentieon - - script: - // -K is an hidden option, used to fix the number of reads processed by bwa mem - // Chunk size can affect bwa results, if not specified, - // the number of threads can change which can give not deterministic result. - // cf https://github.com/CCDG/Pipeline-Standardization/blob/master/PipelineStandard.md - // and https://github.com/gatk-workflows/gatk4-data-processing/blob/8ffa26ff4580df4ac3a5aa9e272a4ff6bab44ba2/processing-for-variant-discovery-gatk4.b37.wgs.inputs.json#L29 - CN = params.sequencing_center ? "CN:${params.sequencing_center}\\t" : "" - readGroup = "@RG\\tID:${idRun}\\t${CN}PU:${idRun}\\tSM:${idSample}\\tLB:${idSample}\\tPL:illumina" - // adjust mismatch penalty for tumor samples - status = statusMap[idPatient, idSample] - extra = status == 1 ? "-B 3" : "" - """ - sentieon bwa mem -K 100000000 -R \"${readGroup}\" ${extra} -t ${task.cpus} -M ${fasta} \ - ${inputFile1} ${inputFile2} | \ - sentieon util sort -r ${fasta} -o ${idSample}_${idRun}.bam -t ${task.cpus} --sam2bam -i - - """ -} - -bam_sentieon_mapped = bam_sentieon_mapped.dump(tag:'Sentieon Mapped BAM') -// Sort BAM whether they are standalone or should be merged - -singleBamSentieon = Channel.create() -multipleBamSentieon = Channel.create() -bam_sentieon_mapped.groupTuple(by:[0, 1]) - .choice(singleBamSentieon, multipleBamSentieon) {it[2].size() > 1 ? 
1 : 0} -singleBamSentieon = singleBamSentieon.map { - idPatient, idSample, idRun, bam -> - [idPatient, idSample, bam] -} -singleBamSentieon = singleBamSentieon.dump(tag:'Single BAM') - -// STEP 1.5: MERGING BAM FROM MULTIPLE LANES - -multipleBam = multipleBam.mix(multipleBamSentieon) - -process MergeBamMapped { - label 'cpus_8' - - tag "${idPatient}-${idSample}" - - input: - set idPatient, idSample, idRun, file(bam) from multipleBam - - output: - set idPatient, idSample, file("${idSample}.bam") into bam_mapped_merged - - script: - """ - samtools merge --threads ${task.cpus} ${idSample}.bam ${bam} - """ -} - -bam_mapped_merged = bam_mapped_merged.dump(tag:'Merged BAM') - -bam_mapped_merged = bam_mapped_merged.mix(singleBam,singleBamSentieon) - -(bam_mapped_merged, bam_sentieon_mapped_merged) = bam_mapped_merged.into(2) +workflow { -if (!params.sentieon) bam_sentieon_mapped_merged.close() -else bam_mapped_merged.close() + CHECK_SAMPLESHEET(ch_input) + .splitCsv(header:true, sep:',') + .map { check_samplesheet_paths(it) } + .set { ch_raw_reads } -bam_mapped_merged = bam_mapped_merged.dump(tag:'BAMs for MD') -bam_sentieon_mapped_merged = bam_sentieon_mapped_merged.dump(tag:'Sentieon BAMs to Index') + FASTQC(ch_raw_reads) -process IndexBamMergedForSentieon { - label 'cpus_8' + OUTPUT_DOCUMENTATION( + ch_output_docs, + ch_output_docs_images) - tag "${idPatient}-${idSample}" - - input: - set idPatient, idSample, file("${idSample}.bam") from bam_sentieon_mapped_merged - - output: - set idPatient, idSample, file("${idSample}.bam"), file("${idSample}.bam.bai") into bam_sentieon_mapped_merged_indexed - - script: - """ - samtools index ${idSample}.bam - """ -} - -(bam_mapped_merged, bam_mapped_merged_to_index) = bam_mapped_merged.into(2) - -process IndexBamFile { - label 'cpus_8' - - tag "${idPatient}-${idSample}" - - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: { - if (save_bam_mapped) "Preprocessing/${idSample}/Mapped/${it}" - else null - } - - input: - set idPatient, idSample, file("${idSample}.bam") from bam_mapped_merged_to_index - - output: - set idPatient, idSample, file("${idSample}.bam"), file("${idSample}.bam.bai") into bam_mapped_merged_indexed - set idPatient, idSample into tsv_bam_indexed - - when: save_bam_mapped || !(params.known_indels) - - script: - """ - samtools index ${idSample}.bam - """ -} - -if (!save_bam_mapped) tsv_bam_indexed.close() - -(tsv_bam_indexed, tsv_bam_indexed_sample) = tsv_bam_indexed.into(2) - -// Creating a TSV file to restart from this step -tsv_bam_indexed.map { idPatient, idSample -> - gender = genderMap[idPatient] - status = statusMap[idPatient, idSample] - bam = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam.bai" - "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n" -}.collectFile( - name: 'mapped.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" -) - -tsv_bam_indexed_sample - .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { idPatient, idSample -> - status = statusMap[idPatient, idSample] - gender = genderMap[idPatient] - bam = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam.bai" - ["mapped_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] -} -// STEP 2: MARKING DUPLICATES - -process MarkDuplicates { - label 'cpus_16' - - tag "${idPatient}-${idSample}" - - publishDir 
params.outdir, mode: params.publish_dir_mode, - saveAs: { - if (it == "${idSample}.bam.metrics") "Reports/${idSample}/MarkDuplicates/${it}" - else "Preprocessing/${idSample}/DuplicatesMarked/${it}" - } - - input: - set idPatient, idSample, file("${idSample}.bam") from bam_mapped_merged - - output: - set idPatient, idSample, file("${idSample}.md.bam"), file("${idSample}.md.bam.bai") into bam_duplicates_marked - set idPatient, idSample into tsv_bam_duplicates_marked - file ("${idSample}.bam.metrics") optional true into duplicates_marked_report - - when: !(params.skip_markduplicates) - - script: - markdup_java_options = task.memory.toGiga() > 8 ? params.markdup_java_options : "\"-Xms" + (task.memory.toGiga() / 2).trunc() + "g -Xmx" + (task.memory.toGiga() - 1) + "g\"" - metrics = 'markduplicates' in skipQC ? '' : "-M ${idSample}.bam.metrics" - if (params.no_gatk_spark) - """ - gatk --java-options ${markdup_java_options} \ - MarkDuplicates \ - --MAX_RECORDS_IN_RAM 50000 \ - --INPUT ${idSample}.bam \ - --METRICS_FILE ${idSample}.bam.metrics \ - --TMP_DIR . \ - --ASSUME_SORT_ORDER coordinate \ - --CREATE_INDEX true \ - --OUTPUT ${idSample}.md.bam - - mv ${idSample}.md.bai ${idSample}.md.bam.bai - """ - else - """ - gatk --java-options ${markdup_java_options} \ - MarkDuplicatesSpark \ - -I ${idSample}.bam \ - -O ${idSample}.md.bam \ - ${metrics} \ - --tmp-dir . \ - --create-output-bam-index true \ - --spark-master local[${task.cpus}] - """ + GET_SOFTWARE_VERSIONS() + + MULTIQC( + ch_multiqc_config, + ch_multiqc_custom_config.collect().ifEmpty([]), + FASTQC.out.collect(), + GET_SOFTWARE_VERSIONS.out.software_versions_yml.collect(), + ch_workflow_summary) } -(tsv_bam_duplicates_marked, tsv_bam_duplicates_marked_sample) = tsv_bam_duplicates_marked.into(2) - -// Creating a TSV file to restart from this step -tsv_bam_duplicates_marked.map { idPatient, idSample -> - gender = genderMap[idPatient] - status = statusMap[idPatient, idSample] - bam = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam.bai" - "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n" -}.collectFile( - name: 'duplicates_marked_no_table.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" -) - -tsv_bam_duplicates_marked_sample - .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { idPatient, idSample -> - status = statusMap[idPatient, idSample] - gender = genderMap[idPatient] - bam = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam.bai" - ["duplicates_marked_no_table_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] -} - -if ('markduplicates' in skipQC) duplicates_marked_report.close() - -if (step == 'preparerecalibration') bam_duplicates_marked = inputSample - -bam_duplicates_marked = bam_duplicates_marked.dump(tag:'MD BAM') -duplicates_marked_report = duplicates_marked_report.dump(tag:'MD Report') - -if (params.skip_markduplicates) bam_duplicates_marked = bam_mapped_merged_indexed - -(bamMD, bamMDToJoin, bam_duplicates_marked) = bam_duplicates_marked.into(3) - -bamBaseRecalibrator = bamMD.combine(intBaseRecalibrator) - -bamBaseRecalibrator = bamBaseRecalibrator.dump(tag:'BAM FOR BASERECALIBRATOR') - -// STEP 2': SENTIEON DEDUP - -process Sentieon_Dedup { - label 'cpus_max' - label 'memory_max' - label 'sentieon' - - tag 
"${idPatient}-${idSample}" - - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: { - if (it == "${idSample}_*.txt" && 'sentieon' in skipQC) null - else if (it == "${idSample}_*.txt") "Reports/${idSample}/Sentieon/${it}" - else "Preprocessing/${idSample}/DedupedSentieon/${it}" - } - - input: - set idPatient, idSample, file(bam), file(bai) from bam_sentieon_mapped_merged_indexed - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - - output: - set idPatient, idSample, file("${idSample}.deduped.bam"), file("${idSample}.deduped.bam.bai") into bam_sentieon_dedup - - when: params.sentieon - - script: - """ - sentieon driver \ - -t ${task.cpus} \ - -i ${bam} \ - -r ${fasta} \ - --algo GCBias --summary ${idSample}_gc_summary.txt ${idSample}_gc_metric.txt \ - --algo MeanQualityByCycle ${idSample}_mq_metric.txt \ - --algo QualDistribution ${idSample}_qd_metric.txt \ - --algo InsertSizeMetricAlgo ${idSample}_is_metric.txt \ - --algo AlignmentStat ${idSample}_aln_metric.txt - - sentieon driver \ - -t ${task.cpus} \ - -i ${bam} \ - --algo LocusCollector \ - --fun score_info ${idSample}_score.gz - - sentieon driver \ - -t ${task.cpus} \ - -i ${bam} \ - --algo Dedup \ - --rmdup \ - --score_info ${idSample}_score.gz \ - --metrics ${idSample}_dedup_metric.txt ${idSample}.deduped.bam - """ -} - -// STEP 3: CREATING RECALIBRATION TABLES - -process BaseRecalibrator { - label 'cpus_1' - - tag "${idPatient}-${idSample}-${intervalBed.baseName}" - - input: - set idPatient, idSample, file(bam), file(bai), file(intervalBed) from bamBaseRecalibrator - file(dbsnp) from ch_dbsnp - file(dbsnpIndex) from ch_dbsnp_tbi - file(fasta) from ch_fasta - file(dict) from ch_dict - file(fastaFai) from ch_fai - file(knownIndels) from ch_known_indels - file(knownIndelsIndex) from ch_known_indels_tbi - - output: - set idPatient, idSample, file("${prefix}${idSample}.recal.table") into tableGatherBQSRReports - set idPatient, idSample into recalTableTSVnoInt - - when: params.known_indels - - script: - dbsnpOptions = params.dbsnp ? "--known-sites ${dbsnp}" : "" - knownOptions = params.known_indels ? knownIndels.collect{"--known-sites ${it}"}.join(' ') : "" - prefix = params.no_intervals ? "" : "${intervalBed.baseName}_" - intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}" - // TODO: --use-original-qualities ??? - """ - gatk --java-options -Xmx${task.memory.toGiga()}g \ - BaseRecalibrator \ - -I ${bam} \ - -O ${prefix}${idSample}.recal.table \ - --tmp-dir . 
\ - -R ${fasta} \ - ${intervalsOptions} \ - ${dbsnpOptions} \ - ${knownOptions} \ - --verbosity INFO - """ -} - -if (!params.no_intervals) tableGatherBQSRReports = tableGatherBQSRReports.groupTuple(by:[0, 1]) - -tableGatherBQSRReports = tableGatherBQSRReports.dump(tag:'BQSR REPORTS') - -if (params.no_intervals) { - (tableGatherBQSRReports, tableGatherBQSRReportsNoInt) = tableGatherBQSRReports.into(2) - recalTable = tableGatherBQSRReportsNoInt -} else recalTableTSVnoInt.close() - -// STEP 3.5: MERGING RECALIBRATION TABLES - -process GatherBQSRReports { - label 'memory_singleCPU_2_task' - label 'cpus_2' - - tag "${idPatient}-${idSample}" - - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: { - if (it == "${idSample}.recal.table" && !params.skip_markduplicates) "Preprocessing/${idSample}/DuplicatesMarked/${it}" - else "Preprocessing/${idSample}/Mapped/${it}" - } - - input: - set idPatient, idSample, file(recal) from tableGatherBQSRReports - - output: - set idPatient, idSample, file("${idSample}.recal.table") into recalTable - file("${idSample}.recal.table") into baseRecalibratorReport - set idPatient, idSample into recalTableTSV - - when: !(params.no_intervals) - - script: - input = recal.collect{"-I ${it}"}.join(' ') - """ - gatk --java-options -Xmx${task.memory.toGiga()}g \ - GatherBQSRReports \ - ${input} \ - -O ${idSample}.recal.table \ - """ -} - -if ('baserecalibrator' in skipQC) baseRecalibratorReport.close() - -recalTable = recalTable.dump(tag:'RECAL TABLE') - -(recalTableTSV, recalTableSampleTSV) = recalTableTSV.mix(recalTableTSVnoInt).into(2) - -// Create TSV files to restart from this step -if (params.skip_markduplicates) { - recalTableTSV.map { idPatient, idSample -> - status = statusMap[idPatient, idSample] - gender = genderMap[idPatient] - bam = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam.bai" - recalTable = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.recal.table" - "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n" - }.collectFile( - name: 'mapped_no_duplicates_marked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" - ) - - recalTableSampleTSV - .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV/") { - idPatient, idSample -> - status = statusMap[idPatient, idSample] - gender = genderMap[idPatient] - bam = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam.bai" - recalTable = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.recal.table" - ["mapped_no_duplicates_marked_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n"] - } -} else { - recalTableTSV.map { idPatient, idSample -> - status = statusMap[idPatient, idSample] - gender = genderMap[idPatient] - bam = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam.bai" - recalTable = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.recal.table" - - "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n" - }.collectFile( - name: 'duplicates_marked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" - ) - - recalTableSampleTSV - .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV/") { - idPatient, idSample -> - 
status = statusMap[idPatient, idSample] - gender = genderMap[idPatient] - bam = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam.bai" - recalTable = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.recal.table" - ["duplicates_marked_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n"] - } -} - -bamApplyBQSR = bamMDToJoin.join(recalTable, by:[0,1]) - -if (step == 'recalibrate') bamApplyBQSR = inputSample - -bamApplyBQSR = bamApplyBQSR.dump(tag:'BAM + BAI + RECAL TABLE') - -bamApplyBQSR = bamApplyBQSR.combine(intApplyBQSR) - -bamApplyBQSR = bamApplyBQSR.dump(tag:'BAM + BAI + RECAL TABLE + INT') - -// STEP 4: RECALIBRATING - -process ApplyBQSR { - label 'memory_singleCPU_2_task' - label 'cpus_2' - - tag "${idPatient}-${idSample}-${intervalBed.baseName}" +// process Get_software_versions { +// publishDir path:"${params.outdir}/pipeline_info", mode: params.publish_dir_mode, +// saveAs: {it.indexOf(".csv") > 0 ? it : null} - input: - set idPatient, idSample, file(bam), file(bai), file(recalibrationReport), file(intervalBed) from bamApplyBQSR - file(dict) from ch_dict - file(fasta) from ch_fasta - file(fastaFai) from ch_fai +// output: +// file 'software_versions_mqc.yaml' into ch_software_versions_yaml +// file "software_versions.csv" - output: - set idPatient, idSample, file("${prefix}${idSample}.recal.bam") into bam_recalibrated_to_merge +// when: !('versions' in skipQC) - script: - prefix = params.no_intervals ? "" : "${intervalBed.baseName}_" - intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}" - """ - gatk --java-options -Xmx${task.memory.toGiga()}g \ - ApplyBQSR \ - -R ${fasta} \ - --input ${bam} \ - --output ${prefix}${idSample}.recal.bam \ - ${intervalsOptions} \ - --bqsr-recal-file ${recalibrationReport} - """ -} +// script: +// """ +// alleleCounter --version &> v_allelecount.txt 2>&1 || true +// bcftools --version &> v_bcftools.txt 2>&1 || true +// bwa &> v_bwa.txt 2>&1 || true +// cnvkit.py version &> v_cnvkit.txt 2>&1 || true +// configManta.py --version &> v_manta.txt 2>&1 || true +// configureStrelkaGermlineWorkflow.py --version &> v_strelka.txt 2>&1 || true +// echo "${workflow.manifest.version}" &> v_pipeline.txt 2>&1 || true +// echo "${workflow.nextflow.version}" &> v_nextflow.txt 2>&1 || true +// snpEff -version &> v_snpeff.txt 2>&1 || true +// fastqc --version &> v_fastqc.txt 2>&1 || true +// freebayes --version &> v_freebayes.txt 2>&1 || true +// freec &> v_controlfreec.txt 2>&1 || true +// gatk ApplyBQSR --help &> v_gatk.txt 2>&1 || true +// msisensor &> v_msisensor.txt 2>&1 || true +// multiqc --version &> v_multiqc.txt 2>&1 || true +// qualimap --version &> v_qualimap.txt 2>&1 || true +// R --version &> v_r.txt 2>&1 || true +// R -e "library(ASCAT); help(package='ASCAT')" &> v_ascat.txt 2>&1 || true +// samtools --version &> v_samtools.txt 2>&1 || true +// tiddit &> v_tiddit.txt 2>&1 || true +// trim_galore -v &> v_trim_galore.txt 2>&1 || true +// vcftools --version &> v_vcftools.txt 2>&1 || true +// vep --help &> v_vep.txt 2>&1 || true -(bam_recalibrated_to_merge, bam_recalibrated_to_index) = bam_recalibrated_to_merge.groupTuple(by:[0, 1]).into(2) - -// STEP 4': SENTIEON BQSR - -bam_sentieon_dedup = bam_sentieon_dedup.dump(tag:'deduped.bam') - -process Sentieon_BQSR { - label 'cpus_max' - label 'memory_max' - label 'sentieon' - - tag "${idPatient}-${idSample}" - - 
publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: { - if (it == "${idSample}_recal_result.csv" && 'sentieon' in skipQC) "Reports/${idSample}/Sentieon/${it}" - else "Preprocessing/${idSample}/RecalSentieon/${it}" - } - - input: - set idPatient, idSample, file(bam), file(bai) from bam_sentieon_dedup - file(dbsnp) from ch_dbsnp - file(dbsnpIndex) from ch_dbsnp_tbi - file(fasta) from ch_fasta - file(dict) from ch_dict - file(fastaFai) from ch_fai - file(knownIndels) from ch_known_indels - file(knownIndelsIndex) from ch_known_indels_tbi - - output: - set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bam.bai") into bam_sentieon_recal - set idPatient, idSample, file(bam), file(bai), file("${idSample}.recal.table") into bam_sentieon_deduped_table - set idPatient, idSample into tsv_sentieon - - when: params.sentieon - - script: - known = knownIndels.collect{"--known-sites ${it}"}.join(' ') - """ - sentieon driver \ - -t ${task.cpus} \ - -r ${fasta} \ - -i ${idSample}.deduped.bam \ - --algo QualCal \ - -k ${dbsnp} \ - ${idSample}.recal.table - - sentieon driver \ - -t ${task.cpus} \ - -r ${fasta} \ - -i ${idSample}.deduped.bam \ - -q ${idSample}.recal.table \ - --algo QualCal \ - -k ${dbsnp} \ - ${idSample}.table.post \ - --algo ReadWriter ${idSample}.recal.bam - - sentieon driver \ - -t ${task.cpus} \ - --algo QualCal \ - --plot \ - --before ${idSample}.recal.table \ - --after ${idSample}.table.post \ - ${idSample}_recal_result.csv - """ -} +// scrape_software_versions.py &> software_versions_mqc.yaml +// """ +// } -(tsv_sentieon_deduped, tsv_sentieon_deduped_sample, tsv_sentieon_recal, tsv_sentieon_recal_sample) = tsv_sentieon.into(4) - -// Creating a TSV file to restart from this step -tsv_sentieon_deduped.map { idPatient, idSample -> - gender = genderMap[idPatient] - status = statusMap[idPatient, idSample] - bam = "${params.outdir}/Preprocessing/${idSample}/DedupedSentieon/${idSample}.deduped.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/DedupedSentieon/${idSample}.deduped.bam.bai" - table = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.table" - "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${table}\n" -}.collectFile( - name: 'sentieon_deduped.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" -) - -tsv_sentieon_deduped_sample - .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { idPatient, idSample -> - status = statusMap[idPatient, idSample] - gender = genderMap[idPatient] - bam = "${params.outdir}/Preprocessing/${idSample}/DedupedSentieon/${idSample}.deduped.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/DedupedSentieon/${idSample}.deduped.bam.bai" - table = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.table" - ["sentieon_deduped_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${table}\n"] -} +// ch_software_versions_yaml = ch_software_versions_yaml.dump(tag:'SOFTWARE VERSIONS') -// Creating a TSV file to restart from this step -tsv_sentieon_recal.map { idPatient, idSample -> - gender = genderMap[idPatient] - status = statusMap[idPatient, idSample] - bam = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam.bai" - "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n" -}.collectFile( - name: 'sentieon_recalibrated.tsv', sort: true, storeDir: 
"${params.outdir}/Preprocessing/TSV" -) - -tsv_sentieon_recal_sample - .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { idPatient, idSample -> - status = statusMap[idPatient, idSample] - gender = genderMap[idPatient] - bam = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam.bai" - ["sentieon_recalibrated_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] -} +// /* +// ================================================================================ +// BUILDING INDEXES +// ================================================================================ +// */ -// STEP 4.5: MERGING THE RECALIBRATED BAM FILES +// // And then initialize channels based on params or indexes that were just built -process MergeBamRecal { - label 'cpus_8' +// process BuildBWAindexes { +// tag "${fasta}" - tag "${idPatient}-${idSample}" +// publishDir params.outdir, mode: params.publish_dir_mode, +// saveAs: {params.save_reference ? "reference_genome/BWAIndex/${it}" : null } - publishDir "${params.outdir}/Preprocessing/${idSample}/Recalibrated", mode: params.publish_dir_mode +// input: +// file(fasta) from ch_fasta - input: - set idPatient, idSample, file(bam) from bam_recalibrated_to_merge +// output: +// file("${fasta}.*") into bwa_built - output: - set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bam.bai") into bam_recalibrated - set idPatient, idSample, file("${idSample}.recal.bam") into bam_recalibrated_qc - set idPatient, idSample into tsv_bam_recalibrated +// when: !(params.bwa) && params.fasta && 'mapping' in step - when: !(params.no_intervals) +// script: +// """ +// bwa index ${fasta} +// """ +// } - script: - """ - samtools merge --threads ${task.cpus} ${idSample}.recal.bam ${bam} - samtools index ${idSample}.recal.bam - """ -} +// ch_bwa = params.bwa ? Channel.value(file(params.bwa)) : bwa_built -// STEP 4.5': INDEXING THE RECALIBRATED BAM FILES +// process BuildDict { +// tag "${fasta}" -process IndexBamRecal { - label 'cpus_8' +// publishDir params.outdir, mode: params.publish_dir_mode, +// saveAs: {params.save_reference ? "reference_genome/${it}" : null } - tag "${idPatient}-${idSample}" +// input: +// file(fasta) from ch_fasta - publishDir "${params.outdir}/Preprocessing/${idSample}/Recalibrated", mode: params.publish_dir_mode +// output: +// file("${fasta.baseName}.dict") into dictBuilt - input: - set idPatient, idSample, file("${idSample}.recal.bam") from bam_recalibrated_to_index +// when: !(params.dict) && params.fasta && !('annotate' in step) && !('controlfreec' in step) - output: - set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bam.bai") into bam_recalibrated_indexed - set idPatient, idSample, file("${idSample}.recal.bam") into bam_recalibrated_no_int_qc - set idPatient, idSample into tsv_bam_recalibrated_no_int +// script: +// """ +// gatk --java-options "-Xmx${task.memory.toGiga()}g" \ +// CreateSequenceDictionary \ +// --REFERENCE ${fasta} \ +// --OUTPUT ${fasta.baseName}.dict +// """ +// } - when: params.no_intervals +// ch_dict = params.dict ? 
Channel.value(file(params.dict)) : dictBuilt

-    script:
-    """
-    samtools index ${idSample}.recal.bam
-    """
-}
+// process BuildFastaFai {
+//     tag "${fasta}"

-bam_recalibrated = bam_recalibrated.mix(bam_recalibrated_indexed)
-bam_recalibrated_qc = bam_recalibrated_qc.mix(bam_recalibrated_no_int_qc)
-tsv_bam_recalibrated = tsv_bam_recalibrated.mix(tsv_bam_recalibrated_no_int)
-
-(bam_recalibrated_bamqc, bam_recalibrated_samtools_stats) = bam_recalibrated_qc.into(2)
-(tsv_bam_recalibrated, tsv_bam_recalibrated_sample) = tsv_bam_recalibrated.into(2)
-
-// Creating a TSV file to restart from this step
-tsv_bam_recalibrated.map { idPatient, idSample ->
-    gender = genderMap[idPatient]
-    status = statusMap[idPatient, idSample]
-    bam = "${params.outdir}/Preprocessing/${idSample}/Recalibrated/${idSample}.recal.bam"
-    bai = "${params.outdir}/Preprocessing/${idSample}/Recalibrated/${idSample}.recal.bam.bai"
-    "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"
-}.collectFile(
-    name: 'recalibrated.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV"
-)
-
-tsv_bam_recalibrated_sample
-    .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") {
-        idPatient, idSample ->
-        status = statusMap[idPatient, idSample]
-        gender = genderMap[idPatient]
-        bam = "${params.outdir}/Preprocessing/${idSample}/Recalibrated/${idSample}.recal.bam"
-        bai = "${params.outdir}/Preprocessing/${idSample}/Recalibrated/${idSample}.recal.bam.bai"
-        ["recalibrated_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"]
-}
+//     publishDir params.outdir, mode: params.publish_dir_mode,
+//         saveAs: {params.save_reference ? "reference_genome/${it}" : null }

-// STEP 5: QC
+//     input:
+//         file(fasta) from ch_fasta

-process SamtoolsStats {
-    label 'cpus_2'
+//     output:
+//         file("${fasta}.fai") into fai_built

-    tag "${idPatient}-${idSample}"
+//     when: !(params.fasta_fai) && params.fasta && !('annotate' in step)

-    publishDir "${params.outdir}/Reports/${idSample}/SamToolsStats", mode: params.publish_dir_mode
+//     script:
+//     """
+//     samtools faidx ${fasta}
+//     """
+// }

-    input:
-        set idPatient, idSample, file(bam) from bam_recalibrated_samtools_stats
+// ch_fai = params.fasta_fai ? Channel.value(file(params.fasta_fai)) : fai_built

-    output:
-        file ("${bam}.samtools.stats.out") into samtoolsStatsReport
+// process BuildDbsnpIndex {
+//     tag "${dbsnp}"

-    when: !('samtools' in skipQC)
+//     publishDir params.outdir, mode: params.publish_dir_mode,
+//         saveAs: {params.save_reference ? "reference_genome/${it}" : null }

-    script:
-    """
-    samtools stats ${bam} > ${bam}.samtools.stats.out
-    """
-}
+//     input:
+//         file(dbsnp) from ch_dbsnp

-samtoolsStatsReport = samtoolsStatsReport.dump(tag:'SAMTools')
+//     output:
+//         file("${dbsnp}.tbi") into dbsnp_tbi

-bamBamQC = bamMappedBamQC.mix(bam_recalibrated_bamqc)
+//     when: !(params.dbsnp_index) && params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || 'tnscope' in tools)

-process BamQC {
-    label 'memory_max'
-    label 'cpus_16'
+//     script:
+//     """
+//     tabix -p vcf ${dbsnp}
+//     """
+// }

-    tag "${idPatient}-${idSample}"
+// ch_dbsnp_tbi = params.dbsnp ? params.dbsnp_index ? Channel.value(file(params.dbsnp_index)) : dbsnp_tbi : "null"
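+// NOTE: a purely illustrative restatement of the nested ternary above.
+// Groovy's ?: operator is right-associative, so the one-liner is equivalent
+// to the parenthesised form below ("null" is the string placeholder this
+// script uses when no channel applies):
+//
+// ch_dbsnp_tbi = params.dbsnp
+//     ? (params.dbsnp_index ? Channel.value(file(params.dbsnp_index)) : dbsnp_tbi)
+//     : "null"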
-    publishDir "${params.outdir}/Reports/${idSample}/bamQC", mode: params.publish_dir_mode
+// process BuildGermlineResourceIndex {
+//     tag "${germlineResource}"

-    input:
-        set idPatient, idSample, file(bam) from bamBamQC
-        file(targetBED) from ch_target_bed
+//     publishDir params.outdir, mode: params.publish_dir_mode,
+//         saveAs: {params.save_reference ? "reference_genome/${it}" : null }

-    output:
-        file("${bam.baseName}") into bamQCReport
+//     input:
+//         file(germlineResource) from ch_germline_resource

-    when: !('bamqc' in skipQC)
+//     output:
+//         file("${germlineResource}.tbi") into germline_resource_tbi

-    script:
-    use_bed = params.target_bed ? "-gff ${targetBED}" : ''
-    """
-    qualimap --java-mem-size=${task.memory.toGiga()}G \
-        bamqc \
-        -bam ${bam} \
-        --paint-chromosome-limits \
-        --genome-gc-distr HUMAN \
-        $use_bed \
-        -nt ${task.cpus} \
-        -skip-duplicated \
-        --skip-dup-mode 0 \
-        -outdir ${bam.baseName} \
-        -outformat HTML
-    """
-}
+//     when: !(params.germline_resource_index) && params.germline_resource && 'mutect2' in tools

-bamQCReport = bamQCReport.dump(tag:'BamQC')
+//     script:
+//     """
+//     tabix -p vcf ${germlineResource}
+//     """
+// }

-/*
-================================================================================
-                            GERMLINE VARIANT CALLING
-================================================================================
-*/
+// ch_germline_resource_tbi = params.germline_resource ? params.germline_resource_index ? Channel.value(file(params.germline_resource_index)) : germline_resource_tbi : "null"

-// When using sentieon for mapping, Channel bam_recalibrated is bam_sentieon_recal
-if (params.sentieon && step == 'mapping') bam_recalibrated = bam_sentieon_recal
+// process BuildKnownIndelsIndex {
+//     tag "${knownIndels}"

-// When no knownIndels for mapping, Channel bam_recalibrated is bam_duplicates_marked
-if (!params.known_indels && step == 'mapping') bam_recalibrated = bam_duplicates_marked
+//     publishDir params.outdir, mode: params.publish_dir_mode,
+//         saveAs: {params.save_reference ? "reference_genome/${it}" : null }

-// When starting with variant calling, Channel bam_recalibrated is inputSample
-if (step == 'variantcalling') bam_recalibrated = inputSample
+//     input:
+//         each file(knownIndels) from ch_known_indels

-bam_recalibrated = bam_recalibrated.dump(tag:'BAM for Variant Calling')
+//     output:
+//         file("${knownIndels}.tbi") into known_indels_tbi

-// Here we have a recalibrated bam set
-// The TSV file is formatted like: "idPatient status idSample bamFile baiFile"
-// Manta will be run in Germline mode, or in Tumor mode depending on status
-// HaplotypeCaller, TIDDIT and Strelka will be run for Normal and Tumor samples
+//     when: !(params.known_indels_index) && params.known_indels && ('mapping' in step || 'preparerecalibration' in step)

-(bamMantaSingle, bamStrelkaSingle, bamTIDDIT, bamFreebayesSingleNoIntervals, bamHaplotypeCallerNoIntervals, bamRecalAll) = bam_recalibrated.into(6)
+//     script:
+//     """
+//     tabix -p vcf ${knownIndels}
+//     """
+// }

-(bam_sentieon_DNAseq, bam_sentieon_DNAscope, bam_sentieon_all) = bam_sentieon_deduped_table.into(3)
+// ch_known_indels_tbi = params.known_indels ? params.known_indels_index ? Channel.value(file(params.known_indels_index)) : known_indels_tbi.collect() : "null"
-// To speed Variant Callers up we are chopping the reference into smaller pieces
-// Do variant calling by this intervals, and re-merge the VCFs
+// process BuildPonIndex {
+//     tag "${pon}"

-bamHaplotypeCaller = bamHaplotypeCallerNoIntervals.spread(intHaplotypeCaller)
-bamFreebayesSingle = bamFreebayesSingleNoIntervals.spread(intFreebayesSingle)
+//     publishDir params.outdir, mode: params.publish_dir_mode,
+//         saveAs: {params.save_reference ? "reference_genome/${it}" : null }

-// STEP GATK HAPLOTYPECALLER.1
+//     input:
+//         file(pon) from ch_pon

-process HaplotypeCaller {
-    label 'memory_singleCPU_task_sq'
-    label 'cpus_2'
+//     output:
+//         file("${pon}.tbi") into pon_tbi

-    tag "${idSample}-${intervalBed.baseName}"
+//     when: !(params.pon_index) && params.pon && ('tnscope' in tools || 'mutect2' in tools)
+
+//     script:
+//     """
+//     tabix -p vcf ${pon}
+//     """
+// }
+
+// ch_pon_tbi = params.pon ? params.pon_index ? Channel.value(file(params.pon_index)) : pon_tbi : "null"
+
+// process BuildIntervals {
+//     tag "${fastaFai}"
+
+//     publishDir params.outdir, mode: params.publish_dir_mode,
+//         saveAs: {params.save_reference ? "reference_genome/${it}" : null }
+
+//     input:
+//         file(fastaFai) from ch_fai
+
+//     output:
+//         file("${fastaFai.baseName}.bed") into intervalBuilt
+
+//     when: !(params.intervals) && !('annotate' in step) && !('controlfreec' in step)
+
+//     script:
+//     """
+//     awk -v FS='\t' -v OFS='\t' '{ print \$1, \"0\", \$2 }' ${fastaFai} > ${fastaFai.baseName}.bed
+//     """
+// }
+
+// ch_intervals = params.no_intervals ? "null" : params.intervals && !('annotate' in step) ? Channel.value(file(params.intervals)) : intervalBuilt
+
+// /*
+// ================================================================================
+//                                  PREPROCESSING
+// ================================================================================
+// */
+
+// // STEP 0: CREATING INTERVALS FOR PARALLELIZATION (PREPROCESSING AND VARIANT CALLING)
+
+// process CreateIntervalBeds {
+//     tag "${intervals}"
+
+//     input:
+//         file(intervals) from ch_intervals
+
+//     output:
+//         file '*.bed' into bedIntervals mode flatten
+
+//     when: (!params.no_intervals) && step != 'annotate'
+
+//     script:
+//     // If the interval file is BED format, the fifth column is interpreted to
+//     // contain runtime estimates, which is then used to combine short-running jobs
+//     if (hasExtension(intervals, "bed"))
+//         """
+//         awk -vFS="\t" '{
+//             t = \$5  # runtime estimate
+//             if (t == "") {
+//                 # no runtime estimate in this row, assume default value
+//                 t = (\$3 - \$2) / ${params.nucleotides_per_second}
+//             }
+//             if (name == "" || (chunk > 600 && (chunk + t) > longest * 1.05)) {
+//                 # start a new chunk
+//                 name = sprintf("%s_%d-%d.bed", \$1, \$2+1, \$3)
+//                 chunk = 0
+//                 longest = 0
+//             }
+//             if (t > longest)
+//                 longest = t
+//             chunk += t
+//             print \$0 > name
+//         }' ${intervals}
+//         """
+//     else if (hasExtension(intervals, "interval_list"))
+//         """
+//         grep -v '^@' ${intervals} | awk -vFS="\t" '{
+//             name = sprintf("%s_%d-%d", \$1, \$2, \$3);
+//             printf("%s\\t%d\\t%d\\n", \$1, \$2-1, \$3) > name ".bed"
+//         }'
+//         """
+//     else
+//         """
+//         awk -vFS="[:-]" '{
+//             name = sprintf("%s_%d-%d", \$1, \$2, \$3);
+//             printf("%s\\t%d\\t%d\\n", \$1, \$2-1, \$3) > name ".bed"
+//         }' ${intervals}
+//         """
+// }
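+// NOTE: worked example of the chunking rule in CreateIntervalBeds above,
+// assuming params.nucleotides_per_second = 1000: a 1,200,000 bp BED interval
+// with an empty fifth column gets t = 1200000 / 1000 = 1200 s. The first
+// interval always opens a chunk; after that, intervals keep accumulating into
+// the current chunk file, and a new chunk is only started once the running
+// total exceeds 600 s AND adding the next interval would push it past 1.05x
+// the longest single runtime seen so far.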
+// bedIntervals = bedIntervals
+//     .map { intervalFile ->
+//         def duration = 0.0
+//         for (line in intervalFile.readLines()) {
+//             final fields = line.split('\t')
+//             if (fields.size() >= 5) duration += fields[4].toFloat()
+//             else {
+//                 start = fields[1].toInteger()
+//                 end = fields[2].toInteger()
+//                 duration += (end - start) / params.nucleotides_per_second
+//             }
+//         }
+//         [duration, intervalFile]
+//     }.toSortedList({ a, b -> b[0] <=> a[0] })
+//     .flatten().collate(2)
+//     .map{duration, intervalFile -> intervalFile}
+
+// bedIntervals = bedIntervals.dump(tag:'bedintervals')
+
+// if (params.no_intervals && step != 'annotate') {
+//     file("${params.outdir}/no_intervals.bed").text = "no_intervals\n"
+//     bedIntervals = Channel.from(file("${params.outdir}/no_intervals.bed"))
+// }
+
+// (intBaseRecalibrator, intApplyBQSR, intHaplotypeCaller, intFreebayesSingle, intMpileup, bedIntervals) = bedIntervals.into(6)
+
+// // PREPARING CHANNELS FOR PREPROCESSING AND QC
+
+// inputBam = Channel.create()
+// inputPairReads = Channel.create()
+
+// if (step in ['preparerecalibration', 'recalibrate', 'variantcalling', 'controlfreec', 'annotate']) {
+//     inputBam.close()
+//     inputPairReads.close()
+// } else inputSample.choice(inputPairReads, inputBam) {hasExtension(it[3], "bam") ? 1 : 0}
+
+// (inputBam, inputBamFastQC) = inputBam.into(2)
+
+// // Removing inputFile2 which is null in case of uBAM
+// inputBamFastQC = inputBamFastQC.map {
+//     idPatient, idSample, idRun, inputFile1, inputFile2 ->
+//     [idPatient, idSample, idRun, inputFile1]
+// }
+
+// if (params.split_fastq){
+//     inputPairReads = inputPairReads
+//         // newly split fastq files are named based on the split, so the name is easier to catch
+//         .splitFastq(by: params.split_fastq, compress:true, file:"split", pe:true)
+//         .map {idPatient, idSample, idRun, reads1, reads2 ->
+//             // The split fastq read1 is the 4th element (indexed 3); its name is split_3
+//             // The split fastq read2's name is split_4
+//             // It's followed by the index of the split it's actually based on in the mother fastq file
+//             // Indexing starts at 1
+//             // Extracting the index to get a new idRun
+//             splitIndex = reads1.fileName.toString().minus("split_3.").minus(".gz")
+//             newIdRun = idRun + "_" + splitIndex
+//             // Giving the files a nice new name
+//             newReads1 = file("${idSample}_${newIdRun}_R1.fastq.gz")
+//             newReads2 = file("${idSample}_${newIdRun}_R2.fastq.gz")
+//             [idPatient, idSample, newIdRun, reads1, reads2]}
+// }
+
+// inputPairReads = inputPairReads.dump(tag:'INPUT')
+
+// (inputPairReads, inputPairReadsTrimGalore, inputPairReadsFastQC) = inputPairReads.into(3)
+
+// // STEP 0.5: QC ON READS
+
+// // TODO: Use only one process for FastQC for FASTQ files and uBAM files
+// // FASTQ and uBAM files are renamed based on the sample name
+
+// process FastQCFQ {
+//     label 'FastQC'
+//     label 'cpus_2'
+
+//     tag "${idPatient}-${idRun}"
+
+//     publishDir "${params.outdir}/Reports/${idSample}/FastQC/${idSample}_${idRun}", mode: params.publish_dir_mode
+
+//     input:
+//         set idPatient, idSample, idRun, file("${idSample}_${idRun}_R1.fastq.gz"), file("${idSample}_${idRun}_R2.fastq.gz") from inputPairReadsFastQC
+
+//     output:
+//         file("*.{html,zip}") into fastQCFQReport
+
+//     when: !('fastqc' in skipQC)
+
+//     script:
+//     """
+//     fastqc -t 2 -q ${idSample}_${idRun}_R1.fastq.gz ${idSample}_${idRun}_R2.fastq.gz
+//     """
+// }
+
+// process FastQCBAM {
+//     label 'FastQC'
+//     label 'cpus_2'
+
+//     tag "${idPatient}-${idRun}"
+
+//     publishDir "${params.outdir}/Reports/${idSample}/FastQC/${idSample}_${idRun}", mode: params.publish_dir_mode
+
+//     input:
+//         set idPatient, idSample, idRun, file("${idSample}_${idRun}.bam") from inputBamFastQC
+
+//     output:
+//         file("*.{html,zip}") into fastQCBAMReport
+
+//     when: !('fastqc' in skipQC)
+
+//     script:
+//     """
+//     fastqc -t 2 -q ${idSample}_${idRun}.bam
+//     """
+// }
+
+// fastQCReport = fastQCFQReport.mix(fastQCBAMReport)
+
+// fastQCReport = fastQCReport.dump(tag:'FastQC')
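+// NOTE: a minimal sketch for the TODO above -- one FastQC process for both
+// FASTQ pairs and uBAMs. `inputAllFastQC` is a hypothetical channel that would
+// mix inputPairReadsFastQC and inputBamFastQC with the per-run files collected
+// into a single `reads` list (FastQC accepts FASTQ and BAM inputs alike):
+//
+// process FastQCAll {
+//     label 'FastQC'
+//     label 'cpus_2'
+
+//     tag "${idPatient}-${idRun}"
+
+//     input:
+//         set idPatient, idSample, idRun, file(reads) from inputAllFastQC
+
+//     output:
+//         file("*.{html,zip}") into fastQCAllReport
+
+//     when: !('fastqc' in skipQC)
+
+//     script:
+//     """
+//     fastqc -t 2 -q ${reads}
+//     """
+// }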
+// process TrimGalore {
+//     label 'TrimGalore'
+
+//     tag "${idPatient}-${idRun}"
+
+//     publishDir "${params.outdir}/Reports/${idSample}/TrimGalore/${idSample}_${idRun}", mode: params.publish_dir_mode,
+//         saveAs: {filename ->
+//             if (filename.indexOf("_fastqc") > 0) "FastQC/$filename"
+//             else if (filename.indexOf("trimming_report.txt") > 0) "logs/$filename"
+//             else if (params.save_trimmed) filename
+//             else null
+//         }
+
+//     input:
+//         set idPatient, idSample, idRun, file("${idSample}_${idRun}_R1.fastq.gz"), file("${idSample}_${idRun}_R2.fastq.gz") from inputPairReadsTrimGalore
+
+//     output:
+//         file("*.{html,zip,txt}") into trimGaloreReport
+//         set idPatient, idSample, idRun, file("${idSample}_${idRun}_R1_val_1.fq.gz"), file("${idSample}_${idRun}_R2_val_2.fq.gz") into outputPairReadsTrimGalore
+
+//     when: params.trim_fastq
+
+//     script:
+//     // Calculate number of --cores for TrimGalore based on value of task.cpus
+//     // See: https://github.com/FelixKrueger/TrimGalore/blob/master/Changelog.md#version-060-release-on-1-mar-2019
+//     // See: https://github.com/nf-core/atacseq/pull/65
+//     def cores = 1
+//     if (task.cpus) {
+//         cores = (task.cpus as int) - 4
+//         if (cores < 1) cores = 1
+//         if (cores > 4) cores = 4
+//     }
+//     c_r1 = params.clip_r1 > 0 ? "--clip_r1 ${params.clip_r1}" : ''
+//     c_r2 = params.clip_r2 > 0 ? "--clip_r2 ${params.clip_r2}" : ''
+//     tpc_r1 = params.three_prime_clip_r1 > 0 ? "--three_prime_clip_r1 ${params.three_prime_clip_r1}" : ''
+//     tpc_r2 = params.three_prime_clip_r2 > 0 ? "--three_prime_clip_r2 ${params.three_prime_clip_r2}" : ''
+//     nextseq = params.trim_nextseq > 0 ? "--nextseq ${params.trim_nextseq}" : ''
+//     """
+//     trim_galore \
+//         --cores ${cores} \
+//         --paired \
+//         --fastqc \
+//         --gzip \
+//         ${c_r1} ${c_r2} \
+//         ${tpc_r1} ${tpc_r2} \
+//         ${nextseq} \
+//         ${idSample}_${idRun}_R1.fastq.gz ${idSample}_${idRun}_R2.fastq.gz
+
+//     mv *val_1_fastqc.html "${idSample}_${idRun}_R1.trimmed_fastqc.html"
+//     mv *val_2_fastqc.html "${idSample}_${idRun}_R2.trimmed_fastqc.html"
+//     mv *val_1_fastqc.zip "${idSample}_${idRun}_R1.trimmed_fastqc.zip"
+//     mv *val_2_fastqc.zip "${idSample}_${idRun}_R2.trimmed_fastqc.zip"
+//     """
+// }
+
+// if (!params.trim_fastq) inputPairReadsTrimGalore.close()
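+// NOTE: worked example for the --cores calculation in TrimGalore above:
+// with task.cpus = 16, cores = 16 - 4 = 12, then capped to 4; with
+// task.cpus = 2, cores = -2, then raised back to 1. The headroom is left for
+// the extra read/write and (de)compression processes Trim Galore spawns per
+// core (see the linked changelog and nf-core/atacseq#65).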
"--nextseq ${params.trim_nextseq}" : '' +// """ +// trim_galore \ +// --cores ${cores} \ +// --paired \ +// --fastqc \ +// --gzip \ +// ${c_r1} ${c_r2} \ +// ${tpc_r1} ${tpc_r2} \ +// ${nextseq} \ +// ${idSample}_${idRun}_R1.fastq.gz ${idSample}_${idRun}_R2.fastq.gz + +// mv *val_1_fastqc.html "${idSample}_${idRun}_R1.trimmed_fastqc.html" +// mv *val_2_fastqc.html "${idSample}_${idRun}_R2.trimmed_fastqc.html" +// mv *val_1_fastqc.zip "${idSample}_${idRun}_R1.trimmed_fastqc.zip" +// mv *val_2_fastqc.zip "${idSample}_${idRun}_R2.trimmed_fastqc.zip" +// """ +// } + +// if (!params.trim_fastq) inputPairReadsTrimGalore.close() + +// // STEP 1: MAPPING READS TO REFERENCE GENOME WITH BWA MEM + +// if (params.trim_fastq) inputPairReads = outputPairReadsTrimGalore +// else inputPairReads = inputPairReads.mix(inputBam) + +// inputPairReads = inputPairReads.dump(tag:'INPUT') + +// (inputPairReads, input_pair_reads_sentieon) = inputPairReads.into(2) +// if (params.sentieon) inputPairReads.close() +// else input_pair_reads_sentieon.close() + +// process MapReads { +// label 'cpus_max' + +// tag "${idPatient}-${idRun}" + +// input: +// set idPatient, idSample, idRun, file(inputFile1), file(inputFile2) from inputPairReads +// file(bwaIndex) from ch_bwa +// file(fasta) from ch_fasta +// file(fastaFai) from ch_fai + +// output: +// set idPatient, idSample, idRun, file("${idSample}_${idRun}.bam") into bamMapped +// set idPatient, val("${idSample}_${idRun}"), file("${idSample}_${idRun}.bam") into bamMappedBamQC + +// when: !(params.sentieon) + +// script: +// // -K is an hidden option, used to fix the number of reads processed by bwa mem +// // Chunk size can affect bwa results, if not specified, +// // the number of threads can change which can give not deterministic result. +// // cf https://github.com/CCDG/Pipeline-Standardization/blob/master/PipelineStandard.md +// // and https://github.com/gatk-workflows/gatk4-data-processing/blob/8ffa26ff4580df4ac3a5aa9e272a4ff6bab44ba2/processing-for-variant-discovery-gatk4.b37.wgs.inputs.json#L29 +// CN = params.sequencing_center ? "CN:${params.sequencing_center}\\t" : "" +// readGroup = "@RG\\tID:${idRun}\\t${CN}PU:${idRun}\\tSM:${idSample}\\tLB:${idSample}\\tPL:illumina" +// // adjust mismatch penalty for tumor samples +// status = statusMap[idPatient, idSample] +// extra = status == 1 ? "-B 3" : "" +// convertToFastq = hasExtension(inputFile1, "bam") ? "gatk --java-options -Xmx${task.memory.toGiga()}g SamToFastq --INPUT=${inputFile1} --FASTQ=/dev/stdout --INTERLEAVE=true --NON_PF=true | \\" : "" +// input = hasExtension(inputFile1, "bam") ? "-p /dev/stdin - 2> >(tee ${inputFile1}.bwa.stderr.log >&2)" : "${inputFile1} ${inputFile2}" +// """ +// ${convertToFastq} +// bwa mem -K 100000000 -R \"${readGroup}\" ${extra} -t ${task.cpus} -M ${fasta} \ +// ${input} | \ +// samtools sort --threads ${task.cpus} -m 2G - > ${idSample}_${idRun}.bam +// """ +// } + +// bamMapped = bamMapped.dump(tag:'Mapped BAM') +// // Sort BAM whether they are standalone or should be merged + +// singleBam = Channel.create() +// multipleBam = Channel.create() +// bamMapped.groupTuple(by:[0, 1]) +// .choice(singleBam, multipleBam) {it[2].size() > 1 ? 
+// singleBam = singleBam.map {
+//     idPatient, idSample, idRun, bam ->
+//     [idPatient, idSample, bam]
+// }
+// singleBam = singleBam.dump(tag:'Single BAM')
+
+// // STEP 1': MAPPING READS TO REFERENCE GENOME WITH SENTIEON BWA MEM
+
+// process Sentieon_MapReads {
+//     label 'cpus_max'
+//     label 'memory_max'
+//     label 'sentieon'
+
+//     tag "${idPatient}-${idRun}"
+
+//     input:
+//         set idPatient, idSample, idRun, file(inputFile1), file(inputFile2) from input_pair_reads_sentieon
+//         file(bwaIndex) from ch_bwa
+//         file(fasta) from ch_fasta
+//         file(fastaFai) from ch_fai
+
+//     output:
+//         set idPatient, idSample, idRun, file("${idSample}_${idRun}.bam") into bam_sentieon_mapped
+
+//     when: params.sentieon
+
+//     script:
+//     // -K is a hidden option, used to fix the number of reads processed by bwa mem
+//     // Chunk size can affect bwa results; if it is not specified,
+//     // the number of threads can change, which can give non-deterministic results.
+//     // cf https://github.com/CCDG/Pipeline-Standardization/blob/master/PipelineStandard.md
+//     // and https://github.com/gatk-workflows/gatk4-data-processing/blob/8ffa26ff4580df4ac3a5aa9e272a4ff6bab44ba2/processing-for-variant-discovery-gatk4.b37.wgs.inputs.json#L29
+//     CN = params.sequencing_center ? "CN:${params.sequencing_center}\\t" : ""
+//     readGroup = "@RG\\tID:${idRun}\\t${CN}PU:${idRun}\\tSM:${idSample}\\tLB:${idSample}\\tPL:illumina"
+//     // adjust mismatch penalty for tumor samples
+//     status = statusMap[idPatient, idSample]
+//     extra = status == 1 ? "-B 3" : ""
+//     """
+//     sentieon bwa mem -K 100000000 -R \"${readGroup}\" ${extra} -t ${task.cpus} -M ${fasta} \
+//     ${inputFile1} ${inputFile2} | \
+//     sentieon util sort -r ${fasta} -o ${idSample}_${idRun}.bam -t ${task.cpus} --sam2bam -i -
+//     """
+// }
+
+// bam_sentieon_mapped = bam_sentieon_mapped.dump(tag:'Sentieon Mapped BAM')
+// // Sort BAMs depending on whether they are standalone or should be merged
+
+// singleBamSentieon = Channel.create()
+// multipleBamSentieon = Channel.create()
+// bam_sentieon_mapped.groupTuple(by:[0, 1])
+//     .choice(singleBamSentieon, multipleBamSentieon) {it[2].size() > 1 ?
1 : 0} +// singleBamSentieon = singleBamSentieon.map { +// idPatient, idSample, idRun, bam -> +// [idPatient, idSample, bam] +// } +// singleBamSentieon = singleBamSentieon.dump(tag:'Single BAM') + +// // STEP 1.5: MERGING BAM FROM MULTIPLE LANES + +// multipleBam = multipleBam.mix(multipleBamSentieon) + +// process MergeBamMapped { +// label 'cpus_8' + +// tag "${idPatient}-${idSample}" + +// input: +// set idPatient, idSample, idRun, file(bam) from multipleBam + +// output: +// set idPatient, idSample, file("${idSample}.bam") into bam_mapped_merged + +// script: +// """ +// samtools merge --threads ${task.cpus} ${idSample}.bam ${bam} +// """ +// } + +// bam_mapped_merged = bam_mapped_merged.dump(tag:'Merged BAM') + +// bam_mapped_merged = bam_mapped_merged.mix(singleBam,singleBamSentieon) + +// (bam_mapped_merged, bam_sentieon_mapped_merged) = bam_mapped_merged.into(2) + +// if (!params.sentieon) bam_sentieon_mapped_merged.close() +// else bam_mapped_merged.close() + +// bam_mapped_merged = bam_mapped_merged.dump(tag:'BAMs for MD') +// bam_sentieon_mapped_merged = bam_sentieon_mapped_merged.dump(tag:'Sentieon BAMs to Index') + +// process IndexBamMergedForSentieon { +// label 'cpus_8' + +// tag "${idPatient}-${idSample}" + +// input: +// set idPatient, idSample, file("${idSample}.bam") from bam_sentieon_mapped_merged + +// output: +// set idPatient, idSample, file("${idSample}.bam"), file("${idSample}.bam.bai") into bam_sentieon_mapped_merged_indexed + +// script: +// """ +// samtools index ${idSample}.bam +// """ +// } + +// (bam_mapped_merged, bam_mapped_merged_to_index) = bam_mapped_merged.into(2) + +// process IndexBamFile { +// label 'cpus_8' + +// tag "${idPatient}-${idSample}" + +// publishDir params.outdir, mode: params.publish_dir_mode, +// saveAs: { +// if (save_bam_mapped) "Preprocessing/${idSample}/Mapped/${it}" +// else null +// } + +// input: +// set idPatient, idSample, file("${idSample}.bam") from bam_mapped_merged_to_index + +// output: +// set idPatient, idSample, file("${idSample}.bam"), file("${idSample}.bam.bai") into bam_mapped_merged_indexed +// set idPatient, idSample into tsv_bam_indexed + +// when: save_bam_mapped || !(params.known_indels) + +// script: +// """ +// samtools index ${idSample}.bam +// """ +// } + +// if (!save_bam_mapped) tsv_bam_indexed.close() + +// (tsv_bam_indexed, tsv_bam_indexed_sample) = tsv_bam_indexed.into(2) + +// // Creating a TSV file to restart from this step +// tsv_bam_indexed.map { idPatient, idSample -> +// gender = genderMap[idPatient] +// status = statusMap[idPatient, idSample] +// bam = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam" +// bai = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam.bai" +// "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n" +// }.collectFile( +// name: 'mapped.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" +// ) - input: - set idPatient, idSample, file(bam), file(bai), file(intervalBed) from bamHaplotypeCaller - file(dbsnp) from ch_dbsnp - file(dbsnpIndex) from ch_dbsnp_tbi - file(dict) from ch_dict - file(fasta) from ch_fasta - file(fastaFai) from ch_fai +// tsv_bam_indexed_sample +// .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { idPatient, idSample -> +// status = statusMap[idPatient, idSample] +// gender = genderMap[idPatient] +// bam = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam" +// bai = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam.bai" +// 
["mapped_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] +// } +// // STEP 2: MARKING DUPLICATES - output: - set val("HaplotypeCallerGVCF"), idPatient, idSample, file("${intervalBed.baseName}_${idSample}.g.vcf") into gvcfHaplotypeCaller - set idPatient, idSample, file(intervalBed), file("${intervalBed.baseName}_${idSample}.g.vcf") into gvcfGenotypeGVCFs +// process MarkDuplicates { +// label 'cpus_16' - when: 'haplotypecaller' in tools +// tag "${idPatient}-${idSample}" - script: - intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}" - dbsnpOptions = params.dbsnp ? "--D ${dbsnp}" : "" - """ - gatk --java-options "-Xmx${task.memory.toGiga()}g -Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ - HaplotypeCaller \ - -R ${fasta} \ - -I ${bam} \ - ${intervalsOptions} \ - ${dbsnpOptions} \ - -O ${intervalBed.baseName}_${idSample}.g.vcf \ - -ERC GVCF - """ -} +// publishDir params.outdir, mode: params.publish_dir_mode, +// saveAs: { +// if (it == "${idSample}.bam.metrics") "Reports/${idSample}/MarkDuplicates/${it}" +// else "Preprocessing/${idSample}/DuplicatesMarked/${it}" +// } + +// input: +// set idPatient, idSample, file("${idSample}.bam") from bam_mapped_merged + +// output: +// set idPatient, idSample, file("${idSample}.md.bam"), file("${idSample}.md.bam.bai") into bam_duplicates_marked +// set idPatient, idSample into tsv_bam_duplicates_marked +// file ("${idSample}.bam.metrics") optional true into duplicates_marked_report + +// when: !(params.skip_markduplicates) + +// script: +// markdup_java_options = task.memory.toGiga() > 8 ? params.markdup_java_options : "\"-Xms" + (task.memory.toGiga() / 2).trunc() + "g -Xmx" + (task.memory.toGiga() - 1) + "g\"" +// metrics = 'markduplicates' in skipQC ? '' : "-M ${idSample}.bam.metrics" +// if (params.no_gatk_spark) +// """ +// gatk --java-options ${markdup_java_options} \ +// MarkDuplicates \ +// --MAX_RECORDS_IN_RAM 50000 \ +// --INPUT ${idSample}.bam \ +// --METRICS_FILE ${idSample}.bam.metrics \ +// --TMP_DIR . \ +// --ASSUME_SORT_ORDER coordinate \ +// --CREATE_INDEX true \ +// --OUTPUT ${idSample}.md.bam + +// mv ${idSample}.md.bai ${idSample}.md.bam.bai +// """ +// else +// """ +// gatk --java-options ${markdup_java_options} \ +// MarkDuplicatesSpark \ +// -I ${idSample}.bam \ +// -O ${idSample}.md.bam \ +// ${metrics} \ +// --tmp-dir . 
\ +// --create-output-bam-index true \ +// --spark-master local[${task.cpus}] +// """ +// } + +// (tsv_bam_duplicates_marked, tsv_bam_duplicates_marked_sample) = tsv_bam_duplicates_marked.into(2) + +// // Creating a TSV file to restart from this step +// tsv_bam_duplicates_marked.map { idPatient, idSample -> +// gender = genderMap[idPatient] +// status = statusMap[idPatient, idSample] +// bam = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam" +// bai = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam.bai" +// "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n" +// }.collectFile( +// name: 'duplicates_marked_no_table.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" +// ) + +// tsv_bam_duplicates_marked_sample +// .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { idPatient, idSample -> +// status = statusMap[idPatient, idSample] +// gender = genderMap[idPatient] +// bam = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam" +// bai = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam.bai" +// ["duplicates_marked_no_table_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] +// } + +// if ('markduplicates' in skipQC) duplicates_marked_report.close() + +// if (step == 'preparerecalibration') bam_duplicates_marked = inputSample + +// bam_duplicates_marked = bam_duplicates_marked.dump(tag:'MD BAM') +// duplicates_marked_report = duplicates_marked_report.dump(tag:'MD Report') + +// if (params.skip_markduplicates) bam_duplicates_marked = bam_mapped_merged_indexed + +// (bamMD, bamMDToJoin, bam_duplicates_marked) = bam_duplicates_marked.into(3) + +// bamBaseRecalibrator = bamMD.combine(intBaseRecalibrator) + +// bamBaseRecalibrator = bamBaseRecalibrator.dump(tag:'BAM FOR BASERECALIBRATOR') + +// // STEP 2': SENTIEON DEDUP + +// process Sentieon_Dedup { +// label 'cpus_max' +// label 'memory_max' +// label 'sentieon' + +// tag "${idPatient}-${idSample}" + +// publishDir params.outdir, mode: params.publish_dir_mode, +// saveAs: { +// if (it == "${idSample}_*.txt" && 'sentieon' in skipQC) null +// else if (it == "${idSample}_*.txt") "Reports/${idSample}/Sentieon/${it}" +// else "Preprocessing/${idSample}/DedupedSentieon/${it}" +// } + +// input: +// set idPatient, idSample, file(bam), file(bai) from bam_sentieon_mapped_merged_indexed +// file(fasta) from ch_fasta +// file(fastaFai) from ch_fai + +// output: +// set idPatient, idSample, file("${idSample}.deduped.bam"), file("${idSample}.deduped.bam.bai") into bam_sentieon_dedup + +// when: params.sentieon + +// script: +// """ +// sentieon driver \ +// -t ${task.cpus} \ +// -i ${bam} \ +// -r ${fasta} \ +// --algo GCBias --summary ${idSample}_gc_summary.txt ${idSample}_gc_metric.txt \ +// --algo MeanQualityByCycle ${idSample}_mq_metric.txt \ +// --algo QualDistribution ${idSample}_qd_metric.txt \ +// --algo InsertSizeMetricAlgo ${idSample}_is_metric.txt \ +// --algo AlignmentStat ${idSample}_aln_metric.txt + +// sentieon driver \ +// -t ${task.cpus} \ +// -i ${bam} \ +// --algo LocusCollector \ +// --fun score_info ${idSample}_score.gz + +// sentieon driver \ +// -t ${task.cpus} \ +// -i ${bam} \ +// --algo Dedup \ +// --rmdup \ +// --score_info ${idSample}_score.gz \ +// --metrics ${idSample}_dedup_metric.txt ${idSample}.deduped.bam +// """ +// } + +// // STEP 3: CREATING RECALIBRATION TABLES + +// process BaseRecalibrator { +// label 'cpus_1' + +// tag 
"${idPatient}-${idSample}-${intervalBed.baseName}" + +// input: +// set idPatient, idSample, file(bam), file(bai), file(intervalBed) from bamBaseRecalibrator +// file(dbsnp) from ch_dbsnp +// file(dbsnpIndex) from ch_dbsnp_tbi +// file(fasta) from ch_fasta +// file(dict) from ch_dict +// file(fastaFai) from ch_fai +// file(knownIndels) from ch_known_indels +// file(knownIndelsIndex) from ch_known_indels_tbi + +// output: +// set idPatient, idSample, file("${prefix}${idSample}.recal.table") into tableGatherBQSRReports +// set idPatient, idSample into recalTableTSVnoInt + +// when: params.known_indels + +// script: +// dbsnpOptions = params.dbsnp ? "--known-sites ${dbsnp}" : "" +// knownOptions = params.known_indels ? knownIndels.collect{"--known-sites ${it}"}.join(' ') : "" +// prefix = params.no_intervals ? "" : "${intervalBed.baseName}_" +// intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}" +// // TODO: --use-original-qualities ??? +// """ +// gatk --java-options -Xmx${task.memory.toGiga()}g \ +// BaseRecalibrator \ +// -I ${bam} \ +// -O ${prefix}${idSample}.recal.table \ +// --tmp-dir . \ +// -R ${fasta} \ +// ${intervalsOptions} \ +// ${dbsnpOptions} \ +// ${knownOptions} \ +// --verbosity INFO +// """ +// } + +// if (!params.no_intervals) tableGatherBQSRReports = tableGatherBQSRReports.groupTuple(by:[0, 1]) + +// tableGatherBQSRReports = tableGatherBQSRReports.dump(tag:'BQSR REPORTS') + +// if (params.no_intervals) { +// (tableGatherBQSRReports, tableGatherBQSRReportsNoInt) = tableGatherBQSRReports.into(2) +// recalTable = tableGatherBQSRReportsNoInt +// } else recalTableTSVnoInt.close() + +// // STEP 3.5: MERGING RECALIBRATION TABLES + +// process GatherBQSRReports { +// label 'memory_singleCPU_2_task' +// label 'cpus_2' + +// tag "${idPatient}-${idSample}" + +// publishDir params.outdir, mode: params.publish_dir_mode, +// saveAs: { +// if (it == "${idSample}.recal.table" && !params.skip_markduplicates) "Preprocessing/${idSample}/DuplicatesMarked/${it}" +// else "Preprocessing/${idSample}/Mapped/${it}" +// } + +// input: +// set idPatient, idSample, file(recal) from tableGatherBQSRReports + +// output: +// set idPatient, idSample, file("${idSample}.recal.table") into recalTable +// file("${idSample}.recal.table") into baseRecalibratorReport +// set idPatient, idSample into recalTableTSV + +// when: !(params.no_intervals) + +// script: +// input = recal.collect{"-I ${it}"}.join(' ') +// """ +// gatk --java-options -Xmx${task.memory.toGiga()}g \ +// GatherBQSRReports \ +// ${input} \ +// -O ${idSample}.recal.table \ +// """ +// } + +// if ('baserecalibrator' in skipQC) baseRecalibratorReport.close() + +// recalTable = recalTable.dump(tag:'RECAL TABLE') + +// (recalTableTSV, recalTableSampleTSV) = recalTableTSV.mix(recalTableTSVnoInt).into(2) + +// // Create TSV files to restart from this step +// if (params.skip_markduplicates) { +// recalTableTSV.map { idPatient, idSample -> +// status = statusMap[idPatient, idSample] +// gender = genderMap[idPatient] +// bam = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam" +// bai = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam.bai" +// recalTable = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.recal.table" +// "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n" +// }.collectFile( +// name: 'mapped_no_duplicates_marked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" +// ) + +// recalTableSampleTSV +// .collectFile(storeDir: 
"${params.outdir}/Preprocessing/TSV/") { +// idPatient, idSample -> +// status = statusMap[idPatient, idSample] +// gender = genderMap[idPatient] +// bam = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam" +// bai = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam.bai" +// recalTable = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.recal.table" +// ["mapped_no_duplicates_marked_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n"] +// } +// } else { +// recalTableTSV.map { idPatient, idSample -> +// status = statusMap[idPatient, idSample] +// gender = genderMap[idPatient] +// bam = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam" +// bai = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam.bai" +// recalTable = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.recal.table" + +// "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n" +// }.collectFile( +// name: 'duplicates_marked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" +// ) + +// recalTableSampleTSV +// .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV/") { +// idPatient, idSample -> +// status = statusMap[idPatient, idSample] +// gender = genderMap[idPatient] +// bam = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam" +// bai = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam.bai" +// recalTable = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.recal.table" +// ["duplicates_marked_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n"] +// } +// } + +// bamApplyBQSR = bamMDToJoin.join(recalTable, by:[0,1]) + +// if (step == 'recalibrate') bamApplyBQSR = inputSample + +// bamApplyBQSR = bamApplyBQSR.dump(tag:'BAM + BAI + RECAL TABLE') + +// bamApplyBQSR = bamApplyBQSR.combine(intApplyBQSR) + +// bamApplyBQSR = bamApplyBQSR.dump(tag:'BAM + BAI + RECAL TABLE + INT') + +// // STEP 4: RECALIBRATING + +// process ApplyBQSR { +// label 'memory_singleCPU_2_task' +// label 'cpus_2' + +// tag "${idPatient}-${idSample}-${intervalBed.baseName}" + +// input: +// set idPatient, idSample, file(bam), file(bai), file(recalibrationReport), file(intervalBed) from bamApplyBQSR +// file(dict) from ch_dict +// file(fasta) from ch_fasta +// file(fastaFai) from ch_fai + +// output: +// set idPatient, idSample, file("${prefix}${idSample}.recal.bam") into bam_recalibrated_to_merge + +// script: +// prefix = params.no_intervals ? "" : "${intervalBed.baseName}_" +// intervalsOptions = params.no_intervals ? 
"" : "-L ${intervalBed}" +// """ +// gatk --java-options -Xmx${task.memory.toGiga()}g \ +// ApplyBQSR \ +// -R ${fasta} \ +// --input ${bam} \ +// --output ${prefix}${idSample}.recal.bam \ +// ${intervalsOptions} \ +// --bqsr-recal-file ${recalibrationReport} +// """ +// } + +// (bam_recalibrated_to_merge, bam_recalibrated_to_index) = bam_recalibrated_to_merge.groupTuple(by:[0, 1]).into(2) + +// // STEP 4': SENTIEON BQSR + +// bam_sentieon_dedup = bam_sentieon_dedup.dump(tag:'deduped.bam') + +// process Sentieon_BQSR { +// label 'cpus_max' +// label 'memory_max' +// label 'sentieon' + +// tag "${idPatient}-${idSample}" + +// publishDir params.outdir, mode: params.publish_dir_mode, +// saveAs: { +// if (it == "${idSample}_recal_result.csv" && 'sentieon' in skipQC) "Reports/${idSample}/Sentieon/${it}" +// else "Preprocessing/${idSample}/RecalSentieon/${it}" +// } + +// input: +// set idPatient, idSample, file(bam), file(bai) from bam_sentieon_dedup +// file(dbsnp) from ch_dbsnp +// file(dbsnpIndex) from ch_dbsnp_tbi +// file(fasta) from ch_fasta +// file(dict) from ch_dict +// file(fastaFai) from ch_fai +// file(knownIndels) from ch_known_indels +// file(knownIndelsIndex) from ch_known_indels_tbi + +// output: +// set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bam.bai") into bam_sentieon_recal +// set idPatient, idSample, file(bam), file(bai), file("${idSample}.recal.table") into bam_sentieon_deduped_table +// set idPatient, idSample into tsv_sentieon + +// when: params.sentieon + +// script: +// known = knownIndels.collect{"--known-sites ${it}"}.join(' ') +// """ +// sentieon driver \ +// -t ${task.cpus} \ +// -r ${fasta} \ +// -i ${idSample}.deduped.bam \ +// --algo QualCal \ +// -k ${dbsnp} \ +// ${idSample}.recal.table + +// sentieon driver \ +// -t ${task.cpus} \ +// -r ${fasta} \ +// -i ${idSample}.deduped.bam \ +// -q ${idSample}.recal.table \ +// --algo QualCal \ +// -k ${dbsnp} \ +// ${idSample}.table.post \ +// --algo ReadWriter ${idSample}.recal.bam + +// sentieon driver \ +// -t ${task.cpus} \ +// --algo QualCal \ +// --plot \ +// --before ${idSample}.recal.table \ +// --after ${idSample}.table.post \ +// ${idSample}_recal_result.csv +// """ +// } + +// (tsv_sentieon_deduped, tsv_sentieon_deduped_sample, tsv_sentieon_recal, tsv_sentieon_recal_sample) = tsv_sentieon.into(4) + +// // Creating a TSV file to restart from this step +// tsv_sentieon_deduped.map { idPatient, idSample -> +// gender = genderMap[idPatient] +// status = statusMap[idPatient, idSample] +// bam = "${params.outdir}/Preprocessing/${idSample}/DedupedSentieon/${idSample}.deduped.bam" +// bai = "${params.outdir}/Preprocessing/${idSample}/DedupedSentieon/${idSample}.deduped.bam.bai" +// table = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.table" +// "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${table}\n" +// }.collectFile( +// name: 'sentieon_deduped.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" +// ) + +// tsv_sentieon_deduped_sample +// .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { idPatient, idSample -> +// status = statusMap[idPatient, idSample] +// gender = genderMap[idPatient] +// bam = "${params.outdir}/Preprocessing/${idSample}/DedupedSentieon/${idSample}.deduped.bam" +// bai = "${params.outdir}/Preprocessing/${idSample}/DedupedSentieon/${idSample}.deduped.bam.bai" +// table = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.table" +// 
["sentieon_deduped_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${table}\n"] +// } + +// // Creating a TSV file to restart from this step +// tsv_sentieon_recal.map { idPatient, idSample -> +// gender = genderMap[idPatient] +// status = statusMap[idPatient, idSample] +// bam = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam" +// bai = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam.bai" +// "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n" +// }.collectFile( +// name: 'sentieon_recalibrated.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" +// ) + +// tsv_sentieon_recal_sample +// .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { idPatient, idSample -> +// status = statusMap[idPatient, idSample] +// gender = genderMap[idPatient] +// bam = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam" +// bai = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam.bai" +// ["sentieon_recalibrated_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] +// } + +// // STEP 4.5: MERGING THE RECALIBRATED BAM FILES + +// process MergeBamRecal { +// label 'cpus_8' + +// tag "${idPatient}-${idSample}" + +// publishDir "${params.outdir}/Preprocessing/${idSample}/Recalibrated", mode: params.publish_dir_mode + +// input: +// set idPatient, idSample, file(bam) from bam_recalibrated_to_merge + +// output: +// set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bam.bai") into bam_recalibrated +// set idPatient, idSample, file("${idSample}.recal.bam") into bam_recalibrated_qc +// set idPatient, idSample into tsv_bam_recalibrated + +// when: !(params.no_intervals) + +// script: +// """ +// samtools merge --threads ${task.cpus} ${idSample}.recal.bam ${bam} +// samtools index ${idSample}.recal.bam +// """ +// } + +// // STEP 4.5': INDEXING THE RECALIBRATED BAM FILES + +// process IndexBamRecal { +// label 'cpus_8' + +// tag "${idPatient}-${idSample}" + +// publishDir "${params.outdir}/Preprocessing/${idSample}/Recalibrated", mode: params.publish_dir_mode + +// input: +// set idPatient, idSample, file("${idSample}.recal.bam") from bam_recalibrated_to_index + +// output: +// set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bam.bai") into bam_recalibrated_indexed +// set idPatient, idSample, file("${idSample}.recal.bam") into bam_recalibrated_no_int_qc +// set idPatient, idSample into tsv_bam_recalibrated_no_int + +// when: params.no_intervals + +// script: +// """ +// samtools index ${idSample}.recal.bam +// """ +// } + +// bam_recalibrated = bam_recalibrated.mix(bam_recalibrated_indexed) +// bam_recalibrated_qc = bam_recalibrated_qc.mix(bam_recalibrated_no_int_qc) +// tsv_bam_recalibrated = tsv_bam_recalibrated.mix(tsv_bam_recalibrated_no_int) + +// (bam_recalibrated_bamqc, bam_recalibrated_samtools_stats) = bam_recalibrated_qc.into(2) +// (tsv_bam_recalibrated, tsv_bam_recalibrated_sample) = tsv_bam_recalibrated.into(2) + +// // Creating a TSV file to restart from this step +// tsv_bam_recalibrated.map { idPatient, idSample -> +// gender = genderMap[idPatient] +// status = statusMap[idPatient, idSample] +// bam = "${params.outdir}/Preprocessing/${idSample}/Recalibrated/${idSample}.recal.bam" +// bai = "${params.outdir}/Preprocessing/${idSample}/Recalibrated/${idSample}.recal.bam.bai" +// 
"${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n" +// }.collectFile( +// name: 'recalibrated.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" +// ) -gvcfHaplotypeCaller = gvcfHaplotypeCaller.groupTuple(by:[0, 1, 2]) - -if (params.no_gvcf) gvcfHaplotypeCaller.close() -else gvcfHaplotypeCaller = gvcfHaplotypeCaller.dump(tag:'GVCF HaplotypeCaller') - -// STEP GATK HAPLOTYPECALLER.2 - -process GenotypeGVCFs { - tag "${idSample}-${intervalBed.baseName}" - - input: - set idPatient, idSample, file(intervalBed), file(gvcf) from gvcfGenotypeGVCFs - file(dbsnp) from ch_dbsnp - file(dbsnpIndex) from ch_dbsnp_tbi - file(dict) from ch_dict - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - - output: - set val("HaplotypeCaller"), idPatient, idSample, file("${intervalBed.baseName}_${idSample}.vcf") into vcfGenotypeGVCFs - - when: 'haplotypecaller' in tools - - script: - // Using -L is important for speed and we have to index the interval files also - intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}" - dbsnpOptions = params.dbsnp ? "--D ${dbsnp}" : "" - """ - gatk --java-options -Xmx${task.memory.toGiga()}g \ - IndexFeatureFile \ - -I ${gvcf} - - gatk --java-options -Xmx${task.memory.toGiga()}g \ - GenotypeGVCFs \ - -R ${fasta} \ - ${intervalsOptions} \ - ${dbsnpOptions} \ - -V ${gvcf} \ - -O ${intervalBed.baseName}_${idSample}.vcf - """ -} +// tsv_bam_recalibrated_sample +// .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { +// idPatient, idSample -> +// status = statusMap[idPatient, idSample] +// gender = genderMap[idPatient] +// bam = "${params.outdir}/Preprocessing/${idSample}/Recalibrated/${idSample}.recal.bam" +// bai = "${params.outdir}/Preprocessing/${idSample}/Recalibrated/${idSample}.recal.bam.bai" +// ["recalibrated_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] +// } -vcfGenotypeGVCFs = vcfGenotypeGVCFs.groupTuple(by:[0, 1, 2]) +// // STEP 5: QC -// STEP SENTIEON DNAseq +// process SamtoolsStats { +// label 'cpus_2' -process Sentieon_DNAseq { - label 'cpus_max' - label 'memory_max' - label 'sentieon' +// tag "${idPatient}-${idSample}" - tag "${idSample}" +// publishDir "${params.outdir}/Reports/${idSample}/SamToolsStats", mode: params.publish_dir_mode - input: - set idPatient, idSample, file(bam), file(bai), file(recal) from bam_sentieon_DNAseq - file(dbsnp) from ch_dbsnp - file(dbsnpIndex) from ch_dbsnp_tbi - file(fasta) from ch_fasta - file(fastaFai) from ch_fai +// input: +// set idPatient, idSample, file(bam) from bam_recalibrated_samtools_stats - output: - set val("SentieonDNAseq"), idPatient, idSample, file("DNAseq_${idSample}.vcf") into vcf_sentieon_DNAseq +// output: +// file ("${bam}.samtools.stats.out") into samtoolsStatsReport - when: 'dnaseq' in tools && params.sentieon +// when: !('samtools' in skipQC) - script: - """ - sentieon driver \ - -t ${task.cpus} \ - -r ${fasta} \ - -i ${bam} \ - -q ${recal} \ - --algo Haplotyper \ - -d ${dbsnp} \ - DNAseq_${idSample}.vcf - """ -} +// script: +// """ +// samtools stats ${bam} > ${bam}.samtools.stats.out +// """ +// } -vcf_sentieon_DNAseq = vcf_sentieon_DNAseq.dump(tag:'sentieon DNAseq') - -// STEP SENTIEON DNAscope - -process Sentieon_DNAscope { - label 'cpus_max' - label 'memory_max' - label 'sentieon' - - tag "${idSample}" - - input: - set idPatient, idSample, file(bam), file(bai), file(recal) from bam_sentieon_DNAscope - file(dbsnp) from ch_dbsnp - file(dbsnpIndex) from ch_dbsnp_tbi - file(fasta) from ch_fasta - file(fastaFai) 
from ch_fai - - output: - set val("SentieonDNAscope"), idPatient, idSample, file("DNAscope_${idSample}.vcf") into vcf_sentieon_DNAscope - set val("SentieonDNAscope"), idPatient, idSample, file("DNAscope_SV_${idSample}.vcf") into vcf_sentieon_DNAscope_SV - - when: 'dnascope' in tools && params.sentieon - - script: - """ - sentieon driver \ - -t ${task.cpus} \ - -r ${fasta} \ - -i ${bam} \ - -q ${recal} \ - --algo DNAscope \ - -d ${dbsnp} \ - DNAscope_${idSample}.vcf - - sentieon driver \ - -t ${task.cpus} \ - -r ${fasta}\ - -i ${bam} \ - -q ${recal} \ - --algo DNAscope \ - --var_type bnd \ - -d ${dbsnp} \ - DNAscope_${idSample}.temp.vcf - - sentieon driver \ - -t ${task.cpus} \ - -r ${fasta}\ - -q ${recal} \ - --algo SVSolver \ - -v DNAscope_${idSample}.temp.vcf \ - DNAscope_SV_${idSample}.vcf - """ -} +// samtoolsStatsReport = samtoolsStatsReport.dump(tag:'SAMTools') -vcf_sentieon_DNAscope = vcf_sentieon_DNAscope.dump(tag:'sentieon DNAscope') -vcf_sentieon_DNAscope_SV = vcf_sentieon_DNAscope_SV.dump(tag:'sentieon DNAscope SV') +// bamBamQC = bamMappedBamQC.mix(bam_recalibrated_bamqc) -// STEP STRELKA.1 - SINGLE MODE +// process BamQC { +// label 'memory_max' +// label 'cpus_16' -process StrelkaSingle { - label 'cpus_max' - label 'memory_max' +// tag "${idPatient}-${idSample}" - tag "${idSample}" +// publishDir "${params.outdir}/Reports/${idSample}/bamQC", mode: params.publish_dir_mode - publishDir "${params.outdir}/VariantCalling/${idSample}/Strelka", mode: params.publish_dir_mode +// input: +// set idPatient, idSample, file(bam) from bamBamQC +// file(targetBED) from ch_target_bed - input: - set idPatient, idSample, file(bam), file(bai) from bamStrelkaSingle - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - file(targetBED) from ch_target_bed +// output: +// file("${bam.baseName}") into bamQCReport - output: - set val("Strelka"), idPatient, idSample, file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfStrelkaSingle +// when: !('bamqc' in skipQC) - when: 'strelka' in tools +// script: +// use_bed = params.target_bed ? 
"-gff ${targetBED}" : '' +// """ +// qualimap --java-mem-size=${task.memory.toGiga()}G \ +// bamqc \ +// -bam ${bam} \ +// --paint-chromosome-limits \ +// --genome-gc-distr HUMAN \ +// $use_bed \ +// -nt ${task.cpus} \ +// -skip-duplicated \ +// --skip-dup-mode 0 \ +// -outdir ${bam.baseName} \ +// -outformat HTML +// """ +// } + +// bamQCReport = bamQCReport.dump(tag:'BamQC') + +// /* +// ================================================================================ +// GERMLINE VARIANT CALLING +// ================================================================================ +// */ + +// // When using sentieon for mapping, Channel bam_recalibrated is bam_sentieon_recal +// if (params.sentieon && step == 'mapping') bam_recalibrated = bam_sentieon_recal + +// // When no knownIndels for mapping, Channel bam_recalibrated is bam_duplicates_marked +// if (!params.known_indels && step == 'mapping') bam_recalibrated = bam_duplicates_marked + +// // When starting with variant calling, Channel bam_recalibrated is inputSample +// if (step == 'variantcalling') bam_recalibrated = inputSample + +// bam_recalibrated = bam_recalibrated.dump(tag:'BAM for Variant Calling') + +// // Here we have a recalibrated bam set +// // The TSV file is formatted like: "idPatient status idSample bamFile baiFile" +// // Manta will be run in Germline mode, or in Tumor mode depending on status +// // HaplotypeCaller, TIDDIT and Strelka will be run for Normal and Tumor samples - script: - beforeScript = params.target_bed ? "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" - options = params.target_bed ? "--exome --callRegions call_targets.bed.gz" : "" - """ - ${beforeScript} - configureStrelkaGermlineWorkflow.py \ - --bam ${bam} \ - --referenceFasta ${fasta} \ - ${options} \ - --runDir Strelka +// (bamMantaSingle, bamStrelkaSingle, bamTIDDIT, bamFreebayesSingleNoIntervals, bamHaplotypeCallerNoIntervals, bamRecalAll) = bam_recalibrated.into(6) - python Strelka/runWorkflow.py -m local -j ${task.cpus} +// (bam_sentieon_DNAseq, bam_sentieon_DNAscope, bam_sentieon_all) = bam_sentieon_deduped_table.into(3) - mv Strelka/results/variants/genome.*.vcf.gz \ - Strelka_${idSample}_genome.vcf.gz - mv Strelka/results/variants/genome.*.vcf.gz.tbi \ - Strelka_${idSample}_genome.vcf.gz.tbi - mv Strelka/results/variants/variants.vcf.gz \ - Strelka_${idSample}_variants.vcf.gz - mv Strelka/results/variants/variants.vcf.gz.tbi \ - Strelka_${idSample}_variants.vcf.gz.tbi - """ -} +// // To speed Variant Callers up we are chopping the reference into smaller pieces +// // Do variant calling by this intervals, and re-merge the VCFs -vcfStrelkaSingle = vcfStrelkaSingle.dump(tag:'Strelka - Single Mode') - -// STEP MANTA.1 - SINGLE MODE - -process MantaSingle { - label 'cpus_max' - label 'memory_max' - - tag "${idSample}" - - publishDir "${params.outdir}/VariantCalling/${idSample}/Manta", mode: params.publish_dir_mode - - input: - set idPatient, idSample, file(bam), file(bai) from bamMantaSingle - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - file(targetBED) from ch_target_bed - - output: - set val("Manta"), idPatient, idSample, file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfMantaSingle - - when: 'manta' in tools - - script: - beforeScript = params.target_bed ? "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" - options = params.target_bed ? 
"--exome --callRegions call_targets.bed.gz" : "" - status = statusMap[idPatient, idSample] - inputbam = status == 0 ? "--bam" : "--tumorBam" - vcftype = status == 0 ? "diploid" : "tumor" - """ - ${beforeScript} - configManta.py \ - ${inputbam} ${bam} \ - --reference ${fasta} \ - ${options} \ - --runDir Manta - - python Manta/runWorkflow.py -m local -j ${task.cpus} - - mv Manta/results/variants/candidateSmallIndels.vcf.gz \ - Manta_${idSample}.candidateSmallIndels.vcf.gz - mv Manta/results/variants/candidateSmallIndels.vcf.gz.tbi \ - Manta_${idSample}.candidateSmallIndels.vcf.gz.tbi - mv Manta/results/variants/candidateSV.vcf.gz \ - Manta_${idSample}.candidateSV.vcf.gz - mv Manta/results/variants/candidateSV.vcf.gz.tbi \ - Manta_${idSample}.candidateSV.vcf.gz.tbi - mv Manta/results/variants/${vcftype}SV.vcf.gz \ - Manta_${idSample}.${vcftype}SV.vcf.gz - mv Manta/results/variants/${vcftype}SV.vcf.gz.tbi \ - Manta_${idSample}.${vcftype}SV.vcf.gz.tbi - """ -} +// bamHaplotypeCaller = bamHaplotypeCallerNoIntervals.spread(intHaplotypeCaller) +// bamFreebayesSingle = bamFreebayesSingleNoIntervals.spread(intFreebayesSingle) -vcfMantaSingle = vcfMantaSingle.dump(tag:'Single Manta') +// // STEP GATK HAPLOTYPECALLER.1 -// STEP TIDDIT +// process HaplotypeCaller { +// label 'memory_singleCPU_task_sq' +// label 'cpus_2' -process TIDDIT { - tag "${idSample}" +// tag "${idSample}-${intervalBed.baseName}" - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: { - if (it == "TIDDIT_${idSample}.vcf") "VariantCalling/${idSample}/TIDDIT/${it}" - else "Reports/${idSample}/TIDDIT/${it}" - } +// input: +// set idPatient, idSample, file(bam), file(bai), file(intervalBed) from bamHaplotypeCaller +// file(dbsnp) from ch_dbsnp +// file(dbsnpIndex) from ch_dbsnp_tbi +// file(dict) from ch_dict +// file(fasta) from ch_fasta +// file(fastaFai) from ch_fai + +// output: +// set val("HaplotypeCallerGVCF"), idPatient, idSample, file("${intervalBed.baseName}_${idSample}.g.vcf") into gvcfHaplotypeCaller +// set idPatient, idSample, file(intervalBed), file("${intervalBed.baseName}_${idSample}.g.vcf") into gvcfGenotypeGVCFs + +// when: 'haplotypecaller' in tools + +// script: +// intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}" +// dbsnpOptions = params.dbsnp ? "--D ${dbsnp}" : "" +// """ +// gatk --java-options "-Xmx${task.memory.toGiga()}g -Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ +// HaplotypeCaller \ +// -R ${fasta} \ +// -I ${bam} \ +// ${intervalsOptions} \ +// ${dbsnpOptions} \ +// -O ${intervalBed.baseName}_${idSample}.g.vcf \ +// -ERC GVCF +// """ +// } + +// gvcfHaplotypeCaller = gvcfHaplotypeCaller.groupTuple(by:[0, 1, 2]) + +// if (params.no_gvcf) gvcfHaplotypeCaller.close() +// else gvcfHaplotypeCaller = gvcfHaplotypeCaller.dump(tag:'GVCF HaplotypeCaller') + +// // STEP GATK HAPLOTYPECALLER.2 + +// process GenotypeGVCFs { +// tag "${idSample}-${intervalBed.baseName}" + +// input: +// set idPatient, idSample, file(intervalBed), file(gvcf) from gvcfGenotypeGVCFs +// file(dbsnp) from ch_dbsnp +// file(dbsnpIndex) from ch_dbsnp_tbi +// file(dict) from ch_dict +// file(fasta) from ch_fasta +// file(fastaFai) from ch_fai + +// output: +// set val("HaplotypeCaller"), idPatient, idSample, file("${intervalBed.baseName}_${idSample}.vcf") into vcfGenotypeGVCFs + +// when: 'haplotypecaller' in tools + +// script: +// // Using -L is important for speed and we have to index the interval files also +// intervalsOptions = params.no_intervals ? 
"" : "-L ${intervalBed}" +// dbsnpOptions = params.dbsnp ? "--D ${dbsnp}" : "" +// """ +// gatk --java-options -Xmx${task.memory.toGiga()}g \ +// IndexFeatureFile \ +// -I ${gvcf} + +// gatk --java-options -Xmx${task.memory.toGiga()}g \ +// GenotypeGVCFs \ +// -R ${fasta} \ +// ${intervalsOptions} \ +// ${dbsnpOptions} \ +// -V ${gvcf} \ +// -O ${intervalBed.baseName}_${idSample}.vcf +// """ +// } + +// vcfGenotypeGVCFs = vcfGenotypeGVCFs.groupTuple(by:[0, 1, 2]) + +// // STEP SENTIEON DNAseq + +// process Sentieon_DNAseq { +// label 'cpus_max' +// label 'memory_max' +// label 'sentieon' + +// tag "${idSample}" + +// input: +// set idPatient, idSample, file(bam), file(bai), file(recal) from bam_sentieon_DNAseq +// file(dbsnp) from ch_dbsnp +// file(dbsnpIndex) from ch_dbsnp_tbi +// file(fasta) from ch_fasta +// file(fastaFai) from ch_fai + +// output: +// set val("SentieonDNAseq"), idPatient, idSample, file("DNAseq_${idSample}.vcf") into vcf_sentieon_DNAseq + +// when: 'dnaseq' in tools && params.sentieon + +// script: +// """ +// sentieon driver \ +// -t ${task.cpus} \ +// -r ${fasta} \ +// -i ${bam} \ +// -q ${recal} \ +// --algo Haplotyper \ +// -d ${dbsnp} \ +// DNAseq_${idSample}.vcf +// """ +// } + +// vcf_sentieon_DNAseq = vcf_sentieon_DNAseq.dump(tag:'sentieon DNAseq') + +// // STEP SENTIEON DNAscope + +// process Sentieon_DNAscope { +// label 'cpus_max' +// label 'memory_max' +// label 'sentieon' + +// tag "${idSample}" + +// input: +// set idPatient, idSample, file(bam), file(bai), file(recal) from bam_sentieon_DNAscope +// file(dbsnp) from ch_dbsnp +// file(dbsnpIndex) from ch_dbsnp_tbi +// file(fasta) from ch_fasta +// file(fastaFai) from ch_fai + +// output: +// set val("SentieonDNAscope"), idPatient, idSample, file("DNAscope_${idSample}.vcf") into vcf_sentieon_DNAscope +// set val("SentieonDNAscope"), idPatient, idSample, file("DNAscope_SV_${idSample}.vcf") into vcf_sentieon_DNAscope_SV + +// when: 'dnascope' in tools && params.sentieon + +// script: +// """ +// sentieon driver \ +// -t ${task.cpus} \ +// -r ${fasta} \ +// -i ${bam} \ +// -q ${recal} \ +// --algo DNAscope \ +// -d ${dbsnp} \ +// DNAscope_${idSample}.vcf + +// sentieon driver \ +// -t ${task.cpus} \ +// -r ${fasta}\ +// -i ${bam} \ +// -q ${recal} \ +// --algo DNAscope \ +// --var_type bnd \ +// -d ${dbsnp} \ +// DNAscope_${idSample}.temp.vcf + +// sentieon driver \ +// -t ${task.cpus} \ +// -r ${fasta}\ +// -q ${recal} \ +// --algo SVSolver \ +// -v DNAscope_${idSample}.temp.vcf \ +// DNAscope_SV_${idSample}.vcf +// """ +// } + +// vcf_sentieon_DNAscope = vcf_sentieon_DNAscope.dump(tag:'sentieon DNAscope') +// vcf_sentieon_DNAscope_SV = vcf_sentieon_DNAscope_SV.dump(tag:'sentieon DNAscope SV') + +// // STEP STRELKA.1 - SINGLE MODE + +// process StrelkaSingle { +// label 'cpus_max' +// label 'memory_max' + +// tag "${idSample}" + +// publishDir "${params.outdir}/VariantCalling/${idSample}/Strelka", mode: params.publish_dir_mode + +// input: +// set idPatient, idSample, file(bam), file(bai) from bamStrelkaSingle +// file(fasta) from ch_fasta +// file(fastaFai) from ch_fai +// file(targetBED) from ch_target_bed + +// output: +// set val("Strelka"), idPatient, idSample, file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfStrelkaSingle + +// when: 'strelka' in tools + +// script: +// beforeScript = params.target_bed ? "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" +// options = params.target_bed ? 
"--exome --callRegions call_targets.bed.gz" : "" +// """ +// ${beforeScript} +// configureStrelkaGermlineWorkflow.py \ +// --bam ${bam} \ +// --referenceFasta ${fasta} \ +// ${options} \ +// --runDir Strelka + +// python Strelka/runWorkflow.py -m local -j ${task.cpus} + +// mv Strelka/results/variants/genome.*.vcf.gz \ +// Strelka_${idSample}_genome.vcf.gz +// mv Strelka/results/variants/genome.*.vcf.gz.tbi \ +// Strelka_${idSample}_genome.vcf.gz.tbi +// mv Strelka/results/variants/variants.vcf.gz \ +// Strelka_${idSample}_variants.vcf.gz +// mv Strelka/results/variants/variants.vcf.gz.tbi \ +// Strelka_${idSample}_variants.vcf.gz.tbi +// """ +// } + +// vcfStrelkaSingle = vcfStrelkaSingle.dump(tag:'Strelka - Single Mode') + +// // STEP MANTA.1 - SINGLE MODE + +// process MantaSingle { +// label 'cpus_max' +// label 'memory_max' + +// tag "${idSample}" + +// publishDir "${params.outdir}/VariantCalling/${idSample}/Manta", mode: params.publish_dir_mode + +// input: +// set idPatient, idSample, file(bam), file(bai) from bamMantaSingle +// file(fasta) from ch_fasta +// file(fastaFai) from ch_fai +// file(targetBED) from ch_target_bed + +// output: +// set val("Manta"), idPatient, idSample, file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfMantaSingle + +// when: 'manta' in tools + +// script: +// beforeScript = params.target_bed ? "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" +// options = params.target_bed ? "--exome --callRegions call_targets.bed.gz" : "" +// status = statusMap[idPatient, idSample] +// inputbam = status == 0 ? "--bam" : "--tumorBam" +// vcftype = status == 0 ? "diploid" : "tumor" +// """ +// ${beforeScript} +// configManta.py \ +// ${inputbam} ${bam} \ +// --reference ${fasta} \ +// ${options} \ +// --runDir Manta + +// python Manta/runWorkflow.py -m local -j ${task.cpus} + +// mv Manta/results/variants/candidateSmallIndels.vcf.gz \ +// Manta_${idSample}.candidateSmallIndels.vcf.gz +// mv Manta/results/variants/candidateSmallIndels.vcf.gz.tbi \ +// Manta_${idSample}.candidateSmallIndels.vcf.gz.tbi +// mv Manta/results/variants/candidateSV.vcf.gz \ +// Manta_${idSample}.candidateSV.vcf.gz +// mv Manta/results/variants/candidateSV.vcf.gz.tbi \ +// Manta_${idSample}.candidateSV.vcf.gz.tbi +// mv Manta/results/variants/${vcftype}SV.vcf.gz \ +// Manta_${idSample}.${vcftype}SV.vcf.gz +// mv Manta/results/variants/${vcftype}SV.vcf.gz.tbi \ +// Manta_${idSample}.${vcftype}SV.vcf.gz.tbi +// """ +// } - input: - set idPatient, idSample, file(bam), file(bai) from bamTIDDIT - file(fasta) from ch_fasta - file(fastaFai) from ch_fai +// vcfMantaSingle = vcfMantaSingle.dump(tag:'Single Manta') - output: - set val("TIDDIT"), idPatient, idSample, file("*.vcf.gz"), file("*.tbi") into vcfTIDDIT - set file("TIDDIT_${idSample}.old.vcf"), file("TIDDIT_${idSample}.ploidy.tab"), file("TIDDIT_${idSample}.signals.tab"), file("TIDDIT_${idSample}.wig"), file("TIDDIT_${idSample}.gc.wig") into tidditOut +// // STEP TIDDIT - when: 'tiddit' in tools +// process TIDDIT { +// tag "${idSample}" - script: - """ - tiddit --sv -o TIDDIT_${idSample} --bam ${bam} --ref ${fasta} +// publishDir params.outdir, mode: params.publish_dir_mode, +// saveAs: { +// if (it == "TIDDIT_${idSample}.vcf") "VariantCalling/${idSample}/TIDDIT/${it}" +// else "Reports/${idSample}/TIDDIT/${it}" +// } + +// input: +// set idPatient, idSample, file(bam), file(bai) from bamTIDDIT +// file(fasta) from ch_fasta +// file(fastaFai) from ch_fai + +// output: +// set val("TIDDIT"), 
idPatient, idSample, file("*.vcf.gz"), file("*.tbi") into vcfTIDDIT
+// set file("TIDDIT_${idSample}.old.vcf"), file("TIDDIT_${idSample}.ploidy.tab"), file("TIDDIT_${idSample}.signals.tab"), file("TIDDIT_${idSample}.wig"), file("TIDDIT_${idSample}.gc.wig") into tidditOut
+
+// when: 'tiddit' in tools
+
+// script:
+// """
+// tiddit --sv -o TIDDIT_${idSample} --bam ${bam} --ref ${fasta}
- mv TIDDIT_${idSample}.vcf TIDDIT_${idSample}.old.vcf
+// mv TIDDIT_${idSample}.vcf TIDDIT_${idSample}.old.vcf
- grep -E "#|PASS" TIDDIT_${idSample}.old.vcf > TIDDIT_${idSample}.vcf
+// grep -E "#|PASS" TIDDIT_${idSample}.old.vcf > TIDDIT_${idSample}.vcf
- bgzip --threads ${task.cpus} -c TIDDIT_${idSample}.vcf > TIDDIT_${idSample}.vcf.gz
+// bgzip --threads ${task.cpus} -c TIDDIT_${idSample}.vcf > TIDDIT_${idSample}.vcf.gz
- tabix TIDDIT_${idSample}.vcf.gz
- """
-}
+// tabix TIDDIT_${idSample}.vcf.gz
+// """
+// }
-vcfTIDDIT = vcfTIDDIT.dump(tag:'TIDDIT')
+// vcfTIDDIT = vcfTIDDIT.dump(tag:'TIDDIT')
-// STEP FREEBAYES SINGLE MODE
+// // STEP FREEBAYES SINGLE MODE
-process FreebayesSingle {
- tag "${idSample}-${intervalBed.baseName}"
+// process FreebayesSingle {
+// tag "${idSample}-${intervalBed.baseName}"
- label 'cpus_1'
+// label 'cpus_1'
- input:
- set idPatient, idSample, file(bam), file(bai), file(intervalBed) from bamFreebayesSingle
- file(fasta) from ch_fasta
- file(fastaFai) from ch_software_versions_yaml
+// input:
+// set idPatient, idSample, file(bam), file(bai), file(intervalBed) from bamFreebayesSingle
+// file(fasta) from ch_fasta
+// file(fastaFai) from ch_fai // fixed: was mistakenly wired to ch_software_versions_yaml
- output:
- set val("FreeBayes"), idPatient, idSample, file("${intervalBed.baseName}_${idSample}.vcf") into vcfFreebayesSingle
+// output:
+// set val("FreeBayes"), idPatient, idSample, file("${intervalBed.baseName}_${idSample}.vcf") into vcfFreebayesSingle
- when: 'freebayes' in tools
-
- script:
- intervalsOptions = params.no_intervals ? "" : "-t ${intervalBed}"
- """
- freebayes \
- -f ${fasta} \
- --min-alternate-fraction 0.1 \
- --min-mapping-quality 1 \
- ${intervalsOptions} \
- ${bam} > ${intervalBed.baseName}_${idSample}.vcf
- """
-}
+// when: 'freebayes' in tools
+
+// script:
+// intervalsOptions = params.no_intervals ? "" : "-t ${intervalBed}"
+// """
+// freebayes \
+// -f ${fasta} \
+// --min-alternate-fraction 0.1 \
+// --min-mapping-quality 1 \
+// ${intervalsOptions} \
+// ${bam} > ${intervalBed.baseName}_${idSample}.vcf
+// """
+// }
-vcfFreebayesSingle = vcfFreebayesSingle.groupTuple(by: [0,1,2])
+// vcfFreebayesSingle = vcfFreebayesSingle.groupTuple(by: [0,1,2])
+
+// /*
+// ================================================================================
+// SOMATIC VARIANT CALLING
+// ================================================================================
+// */
+// // Ascat, pileup, pileups with no intervals, recalibrated BAMs
+// (bamAscat, bamMpileup, bamMpileupNoInt, bamRecalAll) = bamRecalAll.into(4)
+
+// // separate BAM by status
+// bamNormal = Channel.create()
+// bamTumor = Channel.create()
+
+// bamRecalAll
+// .choice(bamTumor, bamNormal) {statusMap[it[0], it[1]] == 0 ? 1 : 0}
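The status-based choice above, combined with the cross() remapping that follows, is how tumor/normal pairs are assembled for the somatic callers. A small standalone sketch of the same pattern, with invented sample names and statuses (0 = normal, 1 = tumor):

    // Toy sketch of the tumor/normal pairing pattern (Nextflow DSL1 idiom).
    statusMap = [(['p1', 'normal1']): 0, (['p1', 'tumor1']): 1]   // invented statuses

    normalBams = Channel.create()
    tumorBams  = Channel.create()

    Channel.from(['p1', 'normal1', 'normal1.bam'], ['p1', 'tumor1', 'tumor1.bam'])
        .choice(tumorBams, normalBams) { statusMap[it[0], it[1]] == 0 ? 1 : 0 }

    // cross() matches on the first element (idPatient); map() then drops the duplicated key
    normalBams.cross(tumorBams)
        .map { normal, tumor -> [normal[0], normal[1], normal[2], tumor[1], tumor[2]] }
        .subscribe { println "T/N pair: ${it}" }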
+
+// // Crossing Normal and Tumor to get a T/N pair for Somatic Variant Calling
+// // Remapping channel to remove common key idPatient
+// pairBam = bamNormal.cross(bamTumor).map {
+// normal, tumor ->
+// [normal[0], normal[1], normal[2], normal[3], tumor[1], tumor[2], tumor[3]]
+// }
+
+// pairBam = pairBam.dump(tag:'BAM Somatic Pair')
+
+// // Manta, Strelka, Mutect2, MSIsensor
+// (pairBamManta, pairBamStrelka, pairBamStrelkaBP, pairBamCalculateContamination, pairBamFilterMutect2, pairBamMsisensor, pairBamCNVkit, pairBam) = pairBam.into(8)
+
+// // Making Pair Bam for Sentieon
+
+// // separate BAM by status
+// bam_sentieon_normal = Channel.create()
+// bam_sentieon_tumor = Channel.create()
+
+// bam_sentieon_all
+// .choice(bam_sentieon_tumor, bam_sentieon_normal) {statusMap[it[0], it[1]] == 0 ? 1 : 0}
+
+// // Crossing Normal and Tumor to get a T/N pair for Somatic Variant Calling
+// // Remapping channel to remove common key idPatient
+
+// bam_pair_sentieon_TNscope = bam_sentieon_normal.cross(bam_sentieon_tumor).map {
+// normal, tumor ->
+// [normal[0], normal[1], normal[2], normal[3], normal[4], tumor[1], tumor[2], tumor[3], tumor[4]]
+// }
+
+// intervalPairBam = pairBam.spread(bedIntervals)
+
+// bamMpileup = bamMpileup.spread(intMpileup)
+
+// // intervals for Mutect2 calls, FreeBayes and pileups for Mutect2 filtering
+// (pairBamMutect2, pairBamFreeBayes, pairBamPileupSummaries) = intervalPairBam.into(3)
+
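The spread calls above take the cartesian product of each BAM tuple with the interval channel; this is the scatter half of the scatter/gather strategy, with the per-interval VCFs re-merged later by ConcatVCF. A toy sketch of spread with invented names (in DSL2 it is superseded by combine):

    // Toy sketch of the BAM-by-interval scatter used above (DSL1 spread = cartesian product).
    pairs = Channel.from([['p1', 'normal.bam', 'tumor.bam']])   // one invented T/N tuple
    beds  = Channel.from('chr1_0_1000.bed', 'chr2_0_1000.bed')  // invented interval files

    pairs.spread(beds)
        .subscribe { println it }   // [p1, normal.bam, tumor.bam, chr1_0_1000.bed], then chr2...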
"" : "-t ${intervalBed}" +// """ +// freebayes \ +// -f ${fasta} \ +// --pooled-continuous \ +// --pooled-discrete \ +// --genotype-qualities \ +// --report-genotype-likelihood-max \ +// --allele-balance-priors-off \ +// --min-alternate-fraction 0.03 \ +// --min-repeat-entropy 1 \ +// --min-alternate-count 2 \ +// ${intervalsOptions} \ +// ${bamTumor} \ +// ${bamNormal} > ${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf +// """ +// } + +// vcfFreeBayes = vcfFreeBayes.groupTuple(by:[0,1,2]) + +// // STEP GATK MUTECT2.1 - RAW CALLS + +// process Mutect2 { +// tag "${idSampleTumor}_vs_${idSampleNormal}-${intervalBed.baseName}" + +// label 'cpus_1' + +// input: +// set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(intervalBed) from pairBamMutect2 +// file(dict) from ch_dict +// file(fasta) from ch_fasta +// file(fastaFai) from ch_fai +// file(germlineResource) from ch_germline_resource +// file(germlineResourceIndex) from ch_germline_resource_tbi +// file(intervals) from ch_intervals +// file(pon) from ch_pon +// file(ponIndex) from ch_pon_tbi + +// output: +// set val("Mutect2"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf") into mutect2Output +// set idPatient, idSampleNormal, idSampleTumor, file("${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf.stats") optional true into intervalStatsFiles +// set idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf.stats"), file("${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf") optional true into mutect2Stats + +// when: 'mutect2' in tools + +// script: +// // please make a panel-of-normals, using at least 40 samples +// // https://gatkforums.broadinstitute.org/gatk/discussion/11136/how-to-call-somatic-mutations-using-gatk4-mutect2 +// PON = params.pon ? "--panel-of-normals ${pon}" : "" +// intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}" +// softClippedOption = params.ignore_soft_clipped_bases ? 
"--dont-use-soft-clipped-bases true" : "" +// """ +// # Get raw calls +// gatk --java-options "-Xmx${task.memory.toGiga()}g" \ +// Mutect2 \ +// -R ${fasta}\ +// -I ${bamTumor} -tumor ${idSampleTumor} \ +// -I ${bamNormal} -normal ${idSampleNormal} \ +// ${intervalsOptions} \ +// ${softClippedOption} \ +// --germline-resource ${germlineResource} \ +// ${PON} \ +// -O ${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf +// """ +// } + +// mutect2Output = mutect2Output.groupTuple(by:[0,1,2]) +// mutect2Stats = mutect2Stats.groupTuple(by:[0,1]) + +// // STEP GATK MUTECT2.2 - MERGING STATS + +// process MergeMutect2Stats { +// tag "${idSamplePair}" + +// publishDir "${params.outdir}/VariantCalling/${idSamplePair}/Mutect2", mode: params.publish_dir_mode + +// input: +// set idPatient, idSamplePair, file(statsFiles), file(vcf) from mutect2Stats // Actual stats files and corresponding VCF chunks +// file(dict) from ch_dict +// file(fasta) from ch_fasta +// file(fastaFai) from ch_fai +// file(germlineResource) from ch_germline_resource +// file(germlineResourceIndex) from ch_germline_resource_tbi +// file(intervals) from ch_intervals + +// output: +// set idPatient, idSamplePair, file("${idSamplePair}.vcf.gz.stats") into mergedStatsFile + +// when: 'mutect2' in tools + +// script: +// stats = statsFiles.collect{ "-stats ${it} " }.join(' ') +// """ +// gatk --java-options "-Xmx${task.memory.toGiga()}g" \ +// MergeMutectStats \ +// ${stats} \ +// -O ${idSamplePair}.vcf.gz.stats +// """ +// } + +// // we are merging the VCFs that are called separatelly for different intervals +// // so we can have a single sorted VCF containing all the calls for a given caller + +// // STEP MERGING VCF - FREEBAYES & GATK HAPLOTYPECALLER + +// vcfConcatenateVCFs = vcfFreeBayes.mix(vcfFreebayesSingle, vcfGenotypeGVCFs, gvcfHaplotypeCaller) +// vcfConcatenateVCFs = vcfConcatenateVCFs.dump(tag:'VCF to merge') + +// process ConcatVCF { +// label 'cpus_8' + +// tag "${variantCaller}-${idSample}" + +// publishDir "${params.outdir}/VariantCalling/${idSample}/${"$variantCaller"}", mode: params.publish_dir_mode + +// input: +// set variantCaller, idPatient, idSample, file(vcf) from vcfConcatenateVCFs +// file(fastaFai) from ch_fai +// file(targetBED) from ch_target_bed -/* -================================================================================ - SOMATIC VARIANT CALLING -================================================================================ -*/ -// Ascat, pileup, pileups with no intervals, recalibrated BAMs -(bamAscat, bamMpileup, bamMpileupNoInt, bamRecalAll) = bamRecalAll.into(4) +// output: +// // we have this funny *_* pattern to avoid copying the raw calls to publishdir +// set variantCaller, idPatient, idSample, file("*_*.vcf.gz"), file("*_*.vcf.gz.tbi") into vcfConcatenated + +// when: ('haplotypecaller' in tools || 'mutect2' in tools || 'freebayes' in tools) + +// script: +// if (variantCaller == 'HaplotypeCallerGVCF') +// outputFile = "HaplotypeCaller_${idSample}.g.vcf" +// else +// outputFile = "${variantCaller}_${idSample}.vcf" +// options = params.target_bed ? "-t ${targetBED}" : "" +// intervalsOptions = params.no_intervals ? "-n" : "" +// """ +// concatenateVCFs.sh -i ${fastaFai} -c ${task.cpus} -o ${outputFile} ${options} ${intervalsOptions} +// """ +// } -// separate BAM by status -bamNormal = Channel.create() -bamTumor = Channel.create() +// vcfConcatenated = vcfConcatenated.dump(tag:'VCF') -bamRecalAll - .choice(bamTumor, bamNormal) {statusMap[it[0], it[1]] == 0 ? 
1 : 0} +// // STEP MERGING VCF - GATK MUTECT2 (UNFILTERED) -// Crossing Normal and Tumor to get a T/N pair for Somatic Variant Calling -// Remapping channel to remove common key idPatient -pairBam = bamNormal.cross(bamTumor).map { - normal, tumor -> - [normal[0], normal[1], normal[2], normal[3], tumor[1], tumor[2], tumor[3]] -} +// mutect2Output = mutect2Output.dump(tag:'Mutect2 output VCF to merge') + +// process ConcatVCF_Mutect2 { +// label 'cpus_8' + +// tag "${idSample}" -pairBam = pairBam.dump(tag:'BAM Somatic Pair') +// publishDir "${params.outdir}/VariantCalling/${idSample}/Mutect2", mode: params.publish_dir_mode -// Manta, Strelka, Mutect2, MSIsensor -(pairBamManta, pairBamStrelka, pairBamStrelkaBP, pairBamCalculateContamination, pairBamFilterMutect2, pairBamMsisensor, pairBamCNVkit, pairBam) = pairBam.into(8) +// input: +// set variantCaller, idPatient, idSample, file(vcf) from mutect2Output +// file(fastaFai) from ch_fai +// file(targetBED) from ch_target_bed -// Making Pair Bam for Sention +// output: +// // we have this funny *_* pattern to avoid copying the raw calls to publishdir +// set variantCaller, idPatient, idSample, file("*_*.vcf.gz"), file("*_*.vcf.gz.tbi") into vcfConcatenatedForFilter -// separate BAM by status -bam_sention_normal = Channel.create() -bam_sentieon_tumor = Channel.create() +// when: ('haplotypecaller' in tools || 'mutect2' in tools || 'freebayes' in tools) -bam_sentieon_all - .choice(bam_sentieon_tumor, bam_sention_normal) {statusMap[it[0], it[1]] == 0 ? 1 : 0} +// script: +// outputFile = "Mutect2_unfiltered_${idSample}.vcf" +// options = params.target_bed ? "-t ${targetBED}" : "" +// intervalsOptions = params.no_intervals ? "-n" : "" +// """ +// concatenateVCFs.sh -i ${fastaFai} -c ${task.cpus} -o ${outputFile} ${options} ${intervalsOptions} +// """ +// } -// Crossing Normal and Tumor to get a T/N pair for Somatic Variant Calling -// Remapping channel to remove common key idPatient +// vcfConcatenatedForFilter = vcfConcatenatedForFilter.dump(tag:'Mutect2 unfiltered VCF') -bam_pair_sentieon_TNscope = bam_sention_normal.cross(bam_sentieon_tumor).map { - normal, tumor -> - [normal[0], normal[1], normal[2], normal[3], normal[4], tumor[1], tumor[2], tumor[3], tumor[4]] -} +// // STEP GATK MUTECT2.3 - GENERATING PILEUP SUMMARIES -intervalPairBam = pairBam.spread(bedIntervals) +// pairBamPileupSummaries = pairBamPileupSummaries.map{ +// idPatient, idSampleNormal, bamNormal, baiNormal, idSampleTumor, bamTumor, baiTumor, intervalBed -> +// [idPatient, idSampleNormal, idSampleTumor, bamNormal, baiNormal, bamTumor, baiTumor, intervalBed] +// }.join(intervalStatsFiles, by:[0,1,2]) -bamMpileup = bamMpileup.spread(intMpileup) +// process PileupSummariesForMutect2 { +// tag "${idSampleTumor}_vs_${idSampleNormal}-${intervalBed.baseName}" -// intervals for Mutect2 calls, FreeBayes and pileups for Mutect2 filtering -(pairBamMutect2, pairBamFreeBayes, pairBamPileupSummaries) = intervalPairBam.into(3) +// label 'cpus_1' -// STEP FREEBAYES +// input: +// set idPatient, idSampleNormal, idSampleTumor, file(bamNormal), file(baiNormal), file(bamTumor), file(baiTumor), file(intervalBed), file(statsFile) from pairBamPileupSummaries +// file(germlineResource) from ch_germline_resource +// file(germlineResourceIndex) from ch_germline_resource_tbi -process FreeBayes { - tag "${idSampleTumor}_vs_${idSampleNormal}-${intervalBed.baseName}" +// output: +// set idPatient, idSampleNormal, idSampleTumor, file("${intervalBed.baseName}_${idSampleTumor}_pileupsummaries.table") into 
pileupSummaries - label 'cpus_1' +// when: 'mutect2' in tools - input: - set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(intervalBed) from pairBamFreeBayes - file(fasta) from ch_fasta - file(fastaFai) from ch_fai +// script: +// intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}" +// """ +// gatk --java-options "-Xmx${task.memory.toGiga()}g" \ +// GetPileupSummaries \ +// -I ${bamTumor} \ +// -V ${germlineResource} \ +// ${intervalsOptions} \ +// -O ${intervalBed.baseName}_${idSampleTumor}_pileupsummaries.table +// """ +// } - output: - set val("FreeBayes"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf") into vcfFreeBayes +// pileupSummaries = pileupSummaries.groupTuple(by:[0,1,2]) - when: 'freebayes' in tools +// // STEP GATK MUTECT2.4 - MERGING PILEUP SUMMARIES - script: - intervalsOptions = params.no_intervals ? "" : "-t ${intervalBed}" - """ - freebayes \ - -f ${fasta} \ - --pooled-continuous \ - --pooled-discrete \ - --genotype-qualities \ - --report-genotype-likelihood-max \ - --allele-balance-priors-off \ - --min-alternate-fraction 0.03 \ - --min-repeat-entropy 1 \ - --min-alternate-count 2 \ - ${intervalsOptions} \ - ${bamTumor} \ - ${bamNormal} > ${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf - """ -} +// process MergePileupSummaries { +// label 'cpus_1' -vcfFreeBayes = vcfFreeBayes.groupTuple(by:[0,1,2]) - -// STEP GATK MUTECT2.1 - RAW CALLS - -process Mutect2 { - tag "${idSampleTumor}_vs_${idSampleNormal}-${intervalBed.baseName}" - - label 'cpus_1' - - input: - set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(intervalBed) from pairBamMutect2 - file(dict) from ch_dict - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - file(germlineResource) from ch_germline_resource - file(germlineResourceIndex) from ch_germline_resource_tbi - file(intervals) from ch_intervals - file(pon) from ch_pon - file(ponIndex) from ch_pon_tbi - - output: - set val("Mutect2"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf") into mutect2Output - set idPatient, idSampleNormal, idSampleTumor, file("${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf.stats") optional true into intervalStatsFiles - set idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf.stats"), file("${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf") optional true into mutect2Stats - - when: 'mutect2' in tools - - script: - // please make a panel-of-normals, using at least 40 samples - // https://gatkforums.broadinstitute.org/gatk/discussion/11136/how-to-call-somatic-mutations-using-gatk4-mutect2 - PON = params.pon ? "--panel-of-normals ${pon}" : "" - intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}" - softClippedOption = params.ignore_soft_clipped_bases ? 
"--dont-use-soft-clipped-bases true" : "" - """ - # Get raw calls - gatk --java-options "-Xmx${task.memory.toGiga()}g" \ - Mutect2 \ - -R ${fasta}\ - -I ${bamTumor} -tumor ${idSampleTumor} \ - -I ${bamNormal} -normal ${idSampleNormal} \ - ${intervalsOptions} \ - ${softClippedOption} \ - --germline-resource ${germlineResource} \ - ${PON} \ - -O ${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf - """ -} +// tag "${idPatient}_${idSampleTumor}" -mutect2Output = mutect2Output.groupTuple(by:[0,1,2]) -mutect2Stats = mutect2Stats.groupTuple(by:[0,1]) +// publishDir "${params.outdir}/VariantCalling/${idSampleTumor}/Mutect2", mode: params.publish_dir_mode -// STEP GATK MUTECT2.2 - MERGING STATS +// input: +// set idPatient, idSampleNormal, idSampleTumor, file(pileupSums) from pileupSummaries +// file(dict) from ch_dict -process MergeMutect2Stats { - tag "${idSamplePair}" +// output: +// set idPatient, idSampleNormal, idSampleTumor, file("${idSampleTumor}_pileupsummaries.table") into mergedPileupFile - publishDir "${params.outdir}/VariantCalling/${idSamplePair}/Mutect2", mode: params.publish_dir_mode +// when: 'mutect2' in tools - input: - set idPatient, idSamplePair, file(statsFiles), file(vcf) from mutect2Stats // Actual stats files and corresponding VCF chunks - file(dict) from ch_dict - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - file(germlineResource) from ch_germline_resource - file(germlineResourceIndex) from ch_germline_resource_tbi - file(intervals) from ch_intervals +// script: +// allPileups = pileupSums.collect{ "-I ${it} " }.join(' ') +// """ +// gatk --java-options "-Xmx${task.memory.toGiga()}g" \ +// GatherPileupSummaries \ +// --sequence-dictionary ${dict} \ +// ${allPileups} \ +// -O ${idSampleTumor}_pileupsummaries.table +// """ +// } - output: - set idPatient, idSamplePair, file("${idSamplePair}.vcf.gz.stats") into mergedStatsFile +// // STEP GATK MUTECT2.5 - CALCULATING CONTAMINATION - when: 'mutect2' in tools +// pairBamCalculateContamination = pairBamCalculateContamination.map{ +// idPatient, idSampleNormal, bamNormal, baiNormal, idSampleTumor, bamTumor, baiTumor -> +// [idPatient, idSampleNormal, idSampleTumor, bamNormal, baiNormal, bamTumor, baiTumor] +// }.join(mergedPileupFile, by:[0,1,2]) - script: - stats = statsFiles.collect{ "-stats ${it} " }.join(' ') - """ - gatk --java-options "-Xmx${task.memory.toGiga()}g" \ - MergeMutectStats \ - ${stats} \ - -O ${idSamplePair}.vcf.gz.stats - """ -} +// process CalculateContamination { +// label 'cpus_1' -// we are merging the VCFs that are called separatelly for different intervals -// so we can have a single sorted VCF containing all the calls for a given caller +// tag "${idSampleTumor}_vs_${idSampleNormal}" -// STEP MERGING VCF - FREEBAYES & GATK HAPLOTYPECALLER - -vcfConcatenateVCFs = vcfFreeBayes.mix(vcfFreebayesSingle, vcfGenotypeGVCFs, gvcfHaplotypeCaller) -vcfConcatenateVCFs = vcfConcatenateVCFs.dump(tag:'VCF to merge') - -process ConcatVCF { - label 'cpus_8' - - tag "${variantCaller}-${idSample}" - - publishDir "${params.outdir}/VariantCalling/${idSample}/${"$variantCaller"}", mode: params.publish_dir_mode - - input: - set variantCaller, idPatient, idSample, file(vcf) from vcfConcatenateVCFs - file(fastaFai) from ch_fai - file(targetBED) from ch_target_bed - - output: - // we have this funny *_* pattern to avoid copying the raw calls to publishdir - set variantCaller, idPatient, idSample, file("*_*.vcf.gz"), file("*_*.vcf.gz.tbi") into vcfConcatenated - - when: ('haplotypecaller' in tools || 
'mutect2' in tools || 'freebayes' in tools) - - script: - if (variantCaller == 'HaplotypeCallerGVCF') - outputFile = "HaplotypeCaller_${idSample}.g.vcf" - else - outputFile = "${variantCaller}_${idSample}.vcf" - options = params.target_bed ? "-t ${targetBED}" : "" - intervalsOptions = params.no_intervals ? "-n" : "" - """ - concatenateVCFs.sh -i ${fastaFai} -c ${task.cpus} -o ${outputFile} ${options} ${intervalsOptions} - """ -} - -vcfConcatenated = vcfConcatenated.dump(tag:'VCF') - -// STEP MERGING VCF - GATK MUTECT2 (UNFILTERED) - -mutect2Output = mutect2Output.dump(tag:'Mutect2 output VCF to merge') - -process ConcatVCF_Mutect2 { - label 'cpus_8' - - tag "${idSample}" - - publishDir "${params.outdir}/VariantCalling/${idSample}/Mutect2", mode: params.publish_dir_mode - - input: - set variantCaller, idPatient, idSample, file(vcf) from mutect2Output - file(fastaFai) from ch_fai - file(targetBED) from ch_target_bed - - output: - // we have this funny *_* pattern to avoid copying the raw calls to publishdir - set variantCaller, idPatient, idSample, file("*_*.vcf.gz"), file("*_*.vcf.gz.tbi") into vcfConcatenatedForFilter - - when: ('haplotypecaller' in tools || 'mutect2' in tools || 'freebayes' in tools) - - script: - outputFile = "Mutect2_unfiltered_${idSample}.vcf" - options = params.target_bed ? "-t ${targetBED}" : "" - intervalsOptions = params.no_intervals ? "-n" : "" - """ - concatenateVCFs.sh -i ${fastaFai} -c ${task.cpus} -o ${outputFile} ${options} ${intervalsOptions} - """ -} - -vcfConcatenatedForFilter = vcfConcatenatedForFilter.dump(tag:'Mutect2 unfiltered VCF') - -// STEP GATK MUTECT2.3 - GENERATING PILEUP SUMMARIES - -pairBamPileupSummaries = pairBamPileupSummaries.map{ - idPatient, idSampleNormal, bamNormal, baiNormal, idSampleTumor, bamTumor, baiTumor, intervalBed -> - [idPatient, idSampleNormal, idSampleTumor, bamNormal, baiNormal, bamTumor, baiTumor, intervalBed] -}.join(intervalStatsFiles, by:[0,1,2]) - -process PileupSummariesForMutect2 { - tag "${idSampleTumor}_vs_${idSampleNormal}-${intervalBed.baseName}" - - label 'cpus_1' - - input: - set idPatient, idSampleNormal, idSampleTumor, file(bamNormal), file(baiNormal), file(bamTumor), file(baiTumor), file(intervalBed), file(statsFile) from pairBamPileupSummaries - file(germlineResource) from ch_germline_resource - file(germlineResourceIndex) from ch_germline_resource_tbi - - output: - set idPatient, idSampleNormal, idSampleTumor, file("${intervalBed.baseName}_${idSampleTumor}_pileupsummaries.table") into pileupSummaries - - when: 'mutect2' in tools - - script: - intervalsOptions = params.no_intervals ? 
"" : "-L ${intervalBed}" - """ - gatk --java-options "-Xmx${task.memory.toGiga()}g" \ - GetPileupSummaries \ - -I ${bamTumor} \ - -V ${germlineResource} \ - ${intervalsOptions} \ - -O ${intervalBed.baseName}_${idSampleTumor}_pileupsummaries.table - """ -} +// publishDir "${params.outdir}/VariantCalling/${idSampleTumor}/Mutect2", mode: params.publish_dir_mode -pileupSummaries = pileupSummaries.groupTuple(by:[0,1,2]) - -// STEP GATK MUTECT2.4 - MERGING PILEUP SUMMARIES - -process MergePileupSummaries { - label 'cpus_1' - - tag "${idPatient}_${idSampleTumor}" - - publishDir "${params.outdir}/VariantCalling/${idSampleTumor}/Mutect2", mode: params.publish_dir_mode - - input: - set idPatient, idSampleNormal, idSampleTumor, file(pileupSums) from pileupSummaries - file(dict) from ch_dict - - output: - set idPatient, idSampleNormal, idSampleTumor, file("${idSampleTumor}_pileupsummaries.table") into mergedPileupFile - - when: 'mutect2' in tools - - script: - allPileups = pileupSums.collect{ "-I ${it} " }.join(' ') - """ - gatk --java-options "-Xmx${task.memory.toGiga()}g" \ - GatherPileupSummaries \ - --sequence-dictionary ${dict} \ - ${allPileups} \ - -O ${idSampleTumor}_pileupsummaries.table - """ -} - -// STEP GATK MUTECT2.5 - CALCULATING CONTAMINATION - -pairBamCalculateContamination = pairBamCalculateContamination.map{ - idPatient, idSampleNormal, bamNormal, baiNormal, idSampleTumor, bamTumor, baiTumor -> - [idPatient, idSampleNormal, idSampleTumor, bamNormal, baiNormal, bamTumor, baiTumor] -}.join(mergedPileupFile, by:[0,1,2]) - -process CalculateContamination { - label 'cpus_1' - - tag "${idSampleTumor}_vs_${idSampleNormal}" - - publishDir "${params.outdir}/VariantCalling/${idSampleTumor}/Mutect2", mode: params.publish_dir_mode - - input: - set idPatient, idSampleNormal, idSampleTumor, file(bamNormal), file(baiNormal), file(bamTumor), file(baiTumor), file(mergedPileup) from pairBamCalculateContamination +// input: +// set idPatient, idSampleNormal, idSampleTumor, file(bamNormal), file(baiNormal), file(bamTumor), file(baiTumor), file(mergedPileup) from pairBamCalculateContamination - output: - set idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("${idSampleTumor}_contamination.table") into contaminationTable - - when: 'mutect2' in tools - - script: - """ - # calculate contamination - gatk --java-options "-Xmx${task.memory.toGiga()}g" \ - CalculateContamination \ - -I ${idSampleTumor}_pileupsummaries.table \ - -O ${idSampleTumor}_contamination.table - """ -} - -// STEP GATK MUTECT2.6 - FILTERING CALLS - -mutect2CallsToFilter = vcfConcatenatedForFilter.map{ - variantCaller, idPatient, idSamplePair, vcf, tbi -> - [idPatient, idSamplePair, vcf, tbi] -}.join(mergedStatsFile, by:[0,1]).join(contaminationTable, by:[0,1]) - -process FilterMutect2Calls { - label 'cpus_1' - - tag "${idSamplePair}" - - publishDir "${params.outdir}/VariantCalling/${idSamplePair}/Mutect2", mode: params.publish_dir_mode - - input: - set idPatient, idSamplePair, file(unfiltered), file(unfilteredIndex), file(stats), file(contaminationTable) from mutect2CallsToFilter - file(dict) from ch_dict - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - file(germlineResource) from ch_germline_resource - file(germlineResourceIndex) from ch_germline_resource_tbi - file(intervals) from ch_intervals - - output: - set val("Mutect2"), idPatient, idSamplePair, file("Mutect2_filtered_${idSamplePair}.vcf.gz"), file("Mutect2_filtered_${idSamplePair}.vcf.gz.tbi"), 
file("Mutect2_filtered_${idSamplePair}.vcf.gz.filteringStats.tsv") into filteredMutect2Output - - when: 'mutect2' in tools - - script: - """ - # do the actual filtering - gatk --java-options "-Xmx${task.memory.toGiga()}g" \ - FilterMutectCalls \ - -V ${unfiltered} \ - --contamination-table ${contaminationTable} \ - --stats ${stats} \ - -R ${fasta} \ - -O Mutect2_filtered_${idSamplePair}.vcf.gz - """ -} - -// STEP SENTIEON TNSCOPE - -process Sentieon_TNscope { - label 'cpus_max' - label 'memory_max' - label 'sentieon' - - tag "${idSampleTumor}_vs_${idSampleNormal}" - - input: - set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), file(recalNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(recalTumor) from bam_pair_sentieon_TNscope - file(dict) from ch_dict - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - file(dbsnp) from ch_dbsnp - file(dbsnpIndex) from ch_dbsnp_tbi - file(pon) from ch_pon - file(ponIndex) from ch_pon_tbi - - output: - set val("SentieonTNscope"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("*.vcf") into vcf_sentieon_TNscope - - when: 'tnscope' in tools && params.sentieon - - script: - PON = params.pon ? "--pon ${pon}" : "" - """ - sentieon driver \ - -t ${task.cpus} \ - -r ${fasta} \ - -i ${bamTumor} \ - -q ${recalTumor} \ - -i ${bamNormal} \ - -q ${recalNormal} \ - --algo TNscope \ - --tumor_sample ${idSampleTumor} \ - --normal_sample ${idSampleNormal} \ - --dbsnp ${dbsnp} \ - ${PON} \ - TNscope_${idSampleTumor}_vs_${idSampleNormal}.vcf - """ -} - -vcf_sentieon_TNscope = vcf_sentieon_TNscope.dump(tag:'Sentieon TNscope') - -vcf_sentieon = vcf_sentieon_DNAseq.mix(vcf_sentieon_DNAscope, vcf_sentieon_DNAscope_SV, vcf_sentieon_TNscope) - -process CompressSentieonVCF { - tag "${idSample} - ${vcf}" - - publishDir "${params.outdir}/VariantCalling/${idSample}/${variantCaller}", mode: params.publish_dir_mode - - input: - set variantCaller, idPatient, idSample, file(vcf) from vcf_sentieon - - output: - set variantCaller, idPatient, idSample, file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcf_sentieon_compressed - - script: - """ - bgzip < ${vcf} > ${vcf}.gz - tabix ${vcf}.gz - """ -} - -vcf_sentieon_compressed = vcf_sentieon_compressed.dump(tag:'Sentieon VCF indexed') - -// STEP STRELKA.2 - SOMATIC PAIR - -process Strelka { - label 'cpus_max' - label 'memory_max' - - tag "${idSampleTumor}_vs_${idSampleNormal}" - - publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/Strelka", mode: params.publish_dir_mode - - input: - set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from pairBamStrelka - file(dict) from ch_dict - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - file(targetBED) from ch_target_bed - - output: - set val("Strelka"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfStrelka - - when: 'strelka' in tools - - script: - beforeScript = params.target_bed ? "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" - options = params.target_bed ? 
"--exome --callRegions call_targets.bed.gz" : "" - """ - ${beforeScript} - configureStrelkaSomaticWorkflow.py \ - --tumor ${bamTumor} \ - --normal ${bamNormal} \ - --referenceFasta ${fasta} \ - ${options} \ - --runDir Strelka - - python Strelka/runWorkflow.py -m local -j ${task.cpus} - - mv Strelka/results/variants/somatic.indels.vcf.gz \ - Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz - mv Strelka/results/variants/somatic.indels.vcf.gz.tbi \ - Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz.tbi - mv Strelka/results/variants/somatic.snvs.vcf.gz \ - Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz - mv Strelka/results/variants/somatic.snvs.vcf.gz.tbi \ - Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz.tbi - """ -} - -vcfStrelka = vcfStrelka.dump(tag:'Strelka') - -// STEP MANTA.2 - SOMATIC PAIR - -process Manta { - label 'cpus_max' - label 'memory_max' - - tag "${idSampleTumor}_vs_${idSampleNormal}" - - publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/Manta", mode: params.publish_dir_mode - - input: - set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from pairBamManta - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - file(targetBED) from ch_target_bed - - output: - set val("Manta"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfManta - set idPatient, idSampleNormal, idSampleTumor, file("*.candidateSmallIndels.vcf.gz"), file("*.candidateSmallIndels.vcf.gz.tbi") into mantaToStrelka - - when: 'manta' in tools - - script: - beforeScript = params.target_bed ? "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" - options = params.target_bed ? 
"--exome --callRegions call_targets.bed.gz" : "" - """ - ${beforeScript} - configManta.py \ - --normalBam ${bamNormal} \ - --tumorBam ${bamTumor} \ - --reference ${fasta} \ - ${options} \ - --runDir Manta - - python Manta/runWorkflow.py -m local -j ${task.cpus} - - mv Manta/results/variants/candidateSmallIndels.vcf.gz \ - Manta_${idSampleTumor}_vs_${idSampleNormal}.candidateSmallIndels.vcf.gz - mv Manta/results/variants/candidateSmallIndels.vcf.gz.tbi \ - Manta_${idSampleTumor}_vs_${idSampleNormal}.candidateSmallIndels.vcf.gz.tbi - mv Manta/results/variants/candidateSV.vcf.gz \ - Manta_${idSampleTumor}_vs_${idSampleNormal}.candidateSV.vcf.gz - mv Manta/results/variants/candidateSV.vcf.gz.tbi \ - Manta_${idSampleTumor}_vs_${idSampleNormal}.candidateSV.vcf.gz.tbi - mv Manta/results/variants/diploidSV.vcf.gz \ - Manta_${idSampleTumor}_vs_${idSampleNormal}.diploidSV.vcf.gz - mv Manta/results/variants/diploidSV.vcf.gz.tbi \ - Manta_${idSampleTumor}_vs_${idSampleNormal}.diploidSV.vcf.gz.tbi - mv Manta/results/variants/somaticSV.vcf.gz \ - Manta_${idSampleTumor}_vs_${idSampleNormal}.somaticSV.vcf.gz - mv Manta/results/variants/somaticSV.vcf.gz.tbi \ - Manta_${idSampleTumor}_vs_${idSampleNormal}.somaticSV.vcf.gz.tbi - """ -} - -vcfManta = vcfManta.dump(tag:'Manta') - -// Remmaping channels to match input for StrelkaBP -pairBamStrelkaBP = pairBamStrelkaBP.map { - idPatientNormal, idSampleNormal, bamNormal, baiNormal, idSampleTumor, bamTumor, baiTumor -> - [idPatientNormal, idSampleNormal, idSampleTumor, bamNormal, baiNormal, bamTumor, baiTumor] -}.join(mantaToStrelka, by:[0,1,2]).map { - idPatientNormal, idSampleNormal, idSampleTumor, bamNormal, baiNormal, bamTumor, baiTumor, mantaCSI, mantaCSIi -> - [idPatientNormal, idSampleNormal, bamNormal, baiNormal, idSampleTumor, bamTumor, baiTumor, mantaCSI, mantaCSIi] -} - -// STEP STRELKA.3 - SOMATIC PAIR - BEST PRACTICES - -process StrelkaBP { - label 'cpus_max' - label 'memory_max' - - tag "${idSampleTumor}_vs_${idSampleNormal}" - - publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/Strelka", mode: params.publish_dir_mode - - input: - set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(mantaCSI), file(mantaCSIi) from pairBamStrelkaBP - file(dict) from ch_dict - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - file(targetBED) from ch_target_bed - - output: - set val("Strelka"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfStrelkaBP - - when: 'strelka' in tools && 'manta' in tools && !params.no_strelka_bp - - script: - beforeScript = params.target_bed ? "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" - options = params.target_bed ? 
"--exome --callRegions call_targets.bed.gz" : "" - """ - ${beforeScript} - configureStrelkaSomaticWorkflow.py \ - --tumor ${bamTumor} \ - --normal ${bamNormal} \ - --referenceFasta ${fasta} \ - --indelCandidates ${mantaCSI} \ - ${options} \ - --runDir Strelka - - python Strelka/runWorkflow.py -m local -j ${task.cpus} - - mv Strelka/results/variants/somatic.indels.vcf.gz \ - StrelkaBP_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz - mv Strelka/results/variants/somatic.indels.vcf.gz.tbi \ - StrelkaBP_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz.tbi - mv Strelka/results/variants/somatic.snvs.vcf.gz \ - StrelkaBP_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz - mv Strelka/results/variants/somatic.snvs.vcf.gz.tbi \ - StrelkaBP_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz.tbi - """ -} - -vcfStrelkaBP = vcfStrelkaBP.dump(tag:'Strelka BP') - -// STEP CNVkit - -process CNVkit { - tag "${idSampleTumor}_vs_${idSampleNormal}" - - publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/CNVkit", mode: params.publish_dir_mode - - input: - set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from pairBamCNVkit - file(targetBED) from ch_target_bed - file(fasta) from ch_fasta - - output: - set idPatient, idSampleNormal, idSampleTumor, file("${idSampleTumor}*"), file("${idSampleNormal}*") into cnvkitOut - - when: 'cnvkit' in tools && params.target_bed - - script: - """ - cnvkit.py \ - batch \ - ${bamTumor} \ - --normal ${bamNormal} \ - --targets ${targetBED} \ - --fasta ${fasta} \ - --output-reference output_reference.cnn \ - --output-dir ./ \ - --diagram \ - --scatter - """ -} - -// STEP MSISENSOR.1 - SCAN - -// Scan reference genome for microsatellites -process MSIsensor_scan { - label 'cpus_1' - label 'memory_max' - - tag "${fasta}" - - input: - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - - output: - file "microsatellites.list" into msi_scan_ch - - when: 'msisensor' in tools - - script: - """ - msisensor scan -d ${fasta} -o microsatellites.list - """ -} - -// STEP MSISENSOR.2 - SCORE - -// Score the normal vs somatic pair of bams - -process MSIsensor_msi { - label 'cpus_4' - label 'memory_max' - - tag "${idSampleTumor}_vs_${idSampleNormal}" - - publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/MSIsensor", mode: params.publish_dir_mode - - input: - set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from pairBamMsisensor - file msiSites from msi_scan_ch - - output: - set val("Msisensor"), idPatient, file("${idSampleTumor}_vs_${idSampleNormal}_msisensor"), file("${idSampleTumor}_vs_${idSampleNormal}_msisensor_dis"), file("${idSampleTumor}_vs_${idSampleNormal}_msisensor_germline"), file("${idSampleTumor}_vs_${idSampleNormal}_msisensor_somatic") into msisensor_out_ch +// output: +// set idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("${idSampleTumor}_contamination.table") into contaminationTable + +// when: 'mutect2' in tools + +// script: +// """ +// # calculate contamination +// gatk --java-options "-Xmx${task.memory.toGiga()}g" \ +// CalculateContamination \ +// -I ${idSampleTumor}_pileupsummaries.table \ +// -O ${idSampleTumor}_contamination.table +// """ +// } + +// // STEP GATK MUTECT2.6 - FILTERING CALLS + +// mutect2CallsToFilter = vcfConcatenatedForFilter.map{ +// variantCaller, idPatient, idSamplePair, vcf, tbi -> +// [idPatient, idSamplePair, vcf, 
tbi] +// }.join(mergedStatsFile, by:[0,1]).join(contaminationTable, by:[0,1]) + +// process FilterMutect2Calls { +// label 'cpus_1' + +// tag "${idSamplePair}" + +// publishDir "${params.outdir}/VariantCalling/${idSamplePair}/Mutect2", mode: params.publish_dir_mode + +// input: +// set idPatient, idSamplePair, file(unfiltered), file(unfilteredIndex), file(stats), file(contaminationTable) from mutect2CallsToFilter +// file(dict) from ch_dict +// file(fasta) from ch_fasta +// file(fastaFai) from ch_fai +// file(germlineResource) from ch_germline_resource +// file(germlineResourceIndex) from ch_germline_resource_tbi +// file(intervals) from ch_intervals + +// output: +// set val("Mutect2"), idPatient, idSamplePair, file("Mutect2_filtered_${idSamplePair}.vcf.gz"), file("Mutect2_filtered_${idSamplePair}.vcf.gz.tbi"), file("Mutect2_filtered_${idSamplePair}.vcf.gz.filteringStats.tsv") into filteredMutect2Output + +// when: 'mutect2' in tools + +// script: +// """ +// # do the actual filtering +// gatk --java-options "-Xmx${task.memory.toGiga()}g" \ +// FilterMutectCalls \ +// -V ${unfiltered} \ +// --contamination-table ${contaminationTable} \ +// --stats ${stats} \ +// -R ${fasta} \ +// -O Mutect2_filtered_${idSamplePair}.vcf.gz +// """ +// } + +// // STEP SENTIEON TNSCOPE + +// process Sentieon_TNscope { +// label 'cpus_max' +// label 'memory_max' +// label 'sentieon' + +// tag "${idSampleTumor}_vs_${idSampleNormal}" + +// input: +// set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), file(recalNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(recalTumor) from bam_pair_sentieon_TNscope +// file(dict) from ch_dict +// file(fasta) from ch_fasta +// file(fastaFai) from ch_fai +// file(dbsnp) from ch_dbsnp +// file(dbsnpIndex) from ch_dbsnp_tbi +// file(pon) from ch_pon +// file(ponIndex) from ch_pon_tbi + +// output: +// set val("SentieonTNscope"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("*.vcf") into vcf_sentieon_TNscope + +// when: 'tnscope' in tools && params.sentieon + +// script: +// PON = params.pon ? 
"--pon ${pon}" : "" +// """ +// sentieon driver \ +// -t ${task.cpus} \ +// -r ${fasta} \ +// -i ${bamTumor} \ +// -q ${recalTumor} \ +// -i ${bamNormal} \ +// -q ${recalNormal} \ +// --algo TNscope \ +// --tumor_sample ${idSampleTumor} \ +// --normal_sample ${idSampleNormal} \ +// --dbsnp ${dbsnp} \ +// ${PON} \ +// TNscope_${idSampleTumor}_vs_${idSampleNormal}.vcf +// """ +// } + +// vcf_sentieon_TNscope = vcf_sentieon_TNscope.dump(tag:'Sentieon TNscope') + +// vcf_sentieon = vcf_sentieon_DNAseq.mix(vcf_sentieon_DNAscope, vcf_sentieon_DNAscope_SV, vcf_sentieon_TNscope) + +// process CompressSentieonVCF { +// tag "${idSample} - ${vcf}" + +// publishDir "${params.outdir}/VariantCalling/${idSample}/${variantCaller}", mode: params.publish_dir_mode + +// input: +// set variantCaller, idPatient, idSample, file(vcf) from vcf_sentieon + +// output: +// set variantCaller, idPatient, idSample, file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcf_sentieon_compressed + +// script: +// """ +// bgzip < ${vcf} > ${vcf}.gz +// tabix ${vcf}.gz +// """ +// } + +// vcf_sentieon_compressed = vcf_sentieon_compressed.dump(tag:'Sentieon VCF indexed') + +// // STEP STRELKA.2 - SOMATIC PAIR + +// process Strelka { +// label 'cpus_max' +// label 'memory_max' + +// tag "${idSampleTumor}_vs_${idSampleNormal}" + +// publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/Strelka", mode: params.publish_dir_mode + +// input: +// set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from pairBamStrelka +// file(dict) from ch_dict +// file(fasta) from ch_fasta +// file(fastaFai) from ch_fai +// file(targetBED) from ch_target_bed + +// output: +// set val("Strelka"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfStrelka + +// when: 'strelka' in tools + +// script: +// beforeScript = params.target_bed ? "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" +// options = params.target_bed ? 
"--exome --callRegions call_targets.bed.gz" : "" +// """ +// ${beforeScript} +// configureStrelkaSomaticWorkflow.py \ +// --tumor ${bamTumor} \ +// --normal ${bamNormal} \ +// --referenceFasta ${fasta} \ +// ${options} \ +// --runDir Strelka + +// python Strelka/runWorkflow.py -m local -j ${task.cpus} + +// mv Strelka/results/variants/somatic.indels.vcf.gz \ +// Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz +// mv Strelka/results/variants/somatic.indels.vcf.gz.tbi \ +// Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz.tbi +// mv Strelka/results/variants/somatic.snvs.vcf.gz \ +// Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz +// mv Strelka/results/variants/somatic.snvs.vcf.gz.tbi \ +// Strelka_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz.tbi +// """ +// } + +// vcfStrelka = vcfStrelka.dump(tag:'Strelka') + +// // STEP MANTA.2 - SOMATIC PAIR + +// process Manta { +// label 'cpus_max' +// label 'memory_max' + +// tag "${idSampleTumor}_vs_${idSampleNormal}" + +// publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/Manta", mode: params.publish_dir_mode + +// input: +// set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from pairBamManta +// file(fasta) from ch_fasta +// file(fastaFai) from ch_fai +// file(targetBED) from ch_target_bed + +// output: +// set val("Manta"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfManta +// set idPatient, idSampleNormal, idSampleTumor, file("*.candidateSmallIndels.vcf.gz"), file("*.candidateSmallIndels.vcf.gz.tbi") into mantaToStrelka + +// when: 'manta' in tools + +// script: +// beforeScript = params.target_bed ? "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" +// options = params.target_bed ? 
"--exome --callRegions call_targets.bed.gz" : "" +// """ +// ${beforeScript} +// configManta.py \ +// --normalBam ${bamNormal} \ +// --tumorBam ${bamTumor} \ +// --reference ${fasta} \ +// ${options} \ +// --runDir Manta + +// python Manta/runWorkflow.py -m local -j ${task.cpus} + +// mv Manta/results/variants/candidateSmallIndels.vcf.gz \ +// Manta_${idSampleTumor}_vs_${idSampleNormal}.candidateSmallIndels.vcf.gz +// mv Manta/results/variants/candidateSmallIndels.vcf.gz.tbi \ +// Manta_${idSampleTumor}_vs_${idSampleNormal}.candidateSmallIndels.vcf.gz.tbi +// mv Manta/results/variants/candidateSV.vcf.gz \ +// Manta_${idSampleTumor}_vs_${idSampleNormal}.candidateSV.vcf.gz +// mv Manta/results/variants/candidateSV.vcf.gz.tbi \ +// Manta_${idSampleTumor}_vs_${idSampleNormal}.candidateSV.vcf.gz.tbi +// mv Manta/results/variants/diploidSV.vcf.gz \ +// Manta_${idSampleTumor}_vs_${idSampleNormal}.diploidSV.vcf.gz +// mv Manta/results/variants/diploidSV.vcf.gz.tbi \ +// Manta_${idSampleTumor}_vs_${idSampleNormal}.diploidSV.vcf.gz.tbi +// mv Manta/results/variants/somaticSV.vcf.gz \ +// Manta_${idSampleTumor}_vs_${idSampleNormal}.somaticSV.vcf.gz +// mv Manta/results/variants/somaticSV.vcf.gz.tbi \ +// Manta_${idSampleTumor}_vs_${idSampleNormal}.somaticSV.vcf.gz.tbi +// """ +// } + +// vcfManta = vcfManta.dump(tag:'Manta') + +// // Remmaping channels to match input for StrelkaBP +// pairBamStrelkaBP = pairBamStrelkaBP.map { +// idPatientNormal, idSampleNormal, bamNormal, baiNormal, idSampleTumor, bamTumor, baiTumor -> +// [idPatientNormal, idSampleNormal, idSampleTumor, bamNormal, baiNormal, bamTumor, baiTumor] +// }.join(mantaToStrelka, by:[0,1,2]).map { +// idPatientNormal, idSampleNormal, idSampleTumor, bamNormal, baiNormal, bamTumor, baiTumor, mantaCSI, mantaCSIi -> +// [idPatientNormal, idSampleNormal, bamNormal, baiNormal, idSampleTumor, bamTumor, baiTumor, mantaCSI, mantaCSIi] +// } + +// // STEP STRELKA.3 - SOMATIC PAIR - BEST PRACTICES + +// process StrelkaBP { +// label 'cpus_max' +// label 'memory_max' + +// tag "${idSampleTumor}_vs_${idSampleNormal}" + +// publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/Strelka", mode: params.publish_dir_mode + +// input: +// set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(mantaCSI), file(mantaCSIi) from pairBamStrelkaBP +// file(dict) from ch_dict +// file(fasta) from ch_fasta +// file(fastaFai) from ch_fai +// file(targetBED) from ch_target_bed + +// output: +// set val("Strelka"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfStrelkaBP + +// when: 'strelka' in tools && 'manta' in tools && !params.no_strelka_bp + +// script: +// beforeScript = params.target_bed ? "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" +// options = params.target_bed ? 
"--exome --callRegions call_targets.bed.gz" : "" +// """ +// ${beforeScript} +// configureStrelkaSomaticWorkflow.py \ +// --tumor ${bamTumor} \ +// --normal ${bamNormal} \ +// --referenceFasta ${fasta} \ +// --indelCandidates ${mantaCSI} \ +// ${options} \ +// --runDir Strelka + +// python Strelka/runWorkflow.py -m local -j ${task.cpus} + +// mv Strelka/results/variants/somatic.indels.vcf.gz \ +// StrelkaBP_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz +// mv Strelka/results/variants/somatic.indels.vcf.gz.tbi \ +// StrelkaBP_${idSampleTumor}_vs_${idSampleNormal}_somatic_indels.vcf.gz.tbi +// mv Strelka/results/variants/somatic.snvs.vcf.gz \ +// StrelkaBP_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz +// mv Strelka/results/variants/somatic.snvs.vcf.gz.tbi \ +// StrelkaBP_${idSampleTumor}_vs_${idSampleNormal}_somatic_snvs.vcf.gz.tbi +// """ +// } + +// vcfStrelkaBP = vcfStrelkaBP.dump(tag:'Strelka BP') + +// // STEP CNVkit + +// process CNVkit { +// tag "${idSampleTumor}_vs_${idSampleNormal}" + +// publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/CNVkit", mode: params.publish_dir_mode + +// input: +// set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from pairBamCNVkit +// file(targetBED) from ch_target_bed +// file(fasta) from ch_fasta + +// output: +// set idPatient, idSampleNormal, idSampleTumor, file("${idSampleTumor}*"), file("${idSampleNormal}*") into cnvkitOut + +// when: 'cnvkit' in tools && params.target_bed + +// script: +// """ +// cnvkit.py \ +// batch \ +// ${bamTumor} \ +// --normal ${bamNormal} \ +// --targets ${targetBED} \ +// --fasta ${fasta} \ +// --output-reference output_reference.cnn \ +// --output-dir ./ \ +// --diagram \ +// --scatter +// """ +// } + +// // STEP MSISENSOR.1 - SCAN + +// // Scan reference genome for microsatellites +// process MSIsensor_scan { +// label 'cpus_1' +// label 'memory_max' - when: 'msisensor' in tools +// tag "${fasta}" - script: - """ - msisensor msi -d ${msiSites} \ - -b 4 \ - -n ${bamNormal} \ - -t ${bamTumor} \ - -o ${idSampleTumor}_vs_${idSampleNormal}_msisensor - """ -} - -// STEP ASCAT.1 - ALLELECOUNTER - -// Run commands and code from Malin Larsson -// Based on Jesper Eisfeldt's code -process AlleleCounter { - label 'memory_singleCPU_2_task' - - tag "${idSample}" - - input: - set idPatient, idSample, file(bam), file(bai) from bamAscat - file(acLoci) from ch_ac_loci - file(dict) from ch_dict - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - - output: - set idPatient, idSample, file("${idSample}.alleleCount") into alleleCounterOut - - when: 'ascat' in tools - - script: - """ - alleleCounter \ - -l ${acLoci} \ - -r ${fasta} \ - -b ${bam} \ - -o ${idSample}.alleleCount; - """ -} - -alleleCountOutNormal = Channel.create() -alleleCountOutTumor = Channel.create() +// input: +// file(fasta) from ch_fasta +// file(fastaFai) from ch_fai -alleleCounterOut - .choice(alleleCountOutTumor, alleleCountOutNormal) {statusMap[it[0], it[1]] == 0 ? 
1 : 0} +// output: +// file "microsatellites.list" into msi_scan_ch -alleleCounterOut = alleleCountOutNormal.combine(alleleCountOutTumor, by:0) +// when: 'msisensor' in tools -alleleCounterOut = alleleCounterOut.map { - idPatientNormal, idSampleNormal, alleleCountOutNormal, - idSampleTumor, alleleCountOutTumor -> - [idPatientNormal, idSampleNormal, idSampleTumor, alleleCountOutNormal, alleleCountOutTumor] -} +// script: +// """ +// msisensor scan -d ${fasta} -o microsatellites.list +// """ +// } -// STEP ASCAT.2 - CONVERTALLELECOUNTS +// // STEP MSISENSOR.2 - SCORE -// R script from Malin Larssons bitbucket repo: -// https://bitbucket.org/malinlarsson/somatic_wgs_pipeline -process ConvertAlleleCounts { - label 'memory_singleCPU_2_task' +// // Score the normal vs somatic pair of bams - tag "${idSampleTumor}_vs_${idSampleNormal}" +// process MSIsensor_msi { +// label 'cpus_4' +// label 'memory_max' - publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/ASCAT", mode: params.publish_dir_mode +// tag "${idSampleTumor}_vs_${idSampleNormal}" - input: - set idPatient, idSampleNormal, idSampleTumor, file(alleleCountNormal), file(alleleCountTumor) from alleleCounterOut +// publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/MSIsensor", mode: params.publish_dir_mode - output: - set idPatient, idSampleNormal, idSampleTumor, file("${idSampleNormal}.BAF"), file("${idSampleNormal}.LogR"), file("${idSampleTumor}.BAF"), file("${idSampleTumor}.LogR") into convertAlleleCountsOut +// input: +// set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from pairBamMsisensor +// file msiSites from msi_scan_ch - when: 'ascat' in tools +// output: +// set val("Msisensor"), idPatient, file("${idSampleTumor}_vs_${idSampleNormal}_msisensor"), file("${idSampleTumor}_vs_${idSampleNormal}_msisensor_dis"), file("${idSampleTumor}_vs_${idSampleNormal}_msisensor_germline"), file("${idSampleTumor}_vs_${idSampleNormal}_msisensor_somatic") into msisensor_out_ch - script: - gender = genderMap[idPatient] - """ - convertAlleleCounts.r ${idSampleTumor} ${alleleCountTumor} ${idSampleNormal} ${alleleCountNormal} ${gender} - """ -} +// when: 'msisensor' in tools -// STEP ASCAT.3 - ASCAT - -// R scripts from Malin Larssons bitbucket repo: -// https://bitbucket.org/malinlarsson/somatic_wgs_pipeline -process Ascat { - label 'memory_singleCPU_2_task' - - tag "${idSampleTumor}_vs_${idSampleNormal}" - - publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/ASCAT", mode: params.publish_dir_mode - - input: - set idPatient, idSampleNormal, idSampleTumor, file(bafNormal), file(logrNormal), file(bafTumor), file(logrTumor) from convertAlleleCountsOut - file(acLociGC) from ch_ac_loci_gc - - output: - set val("ASCAT"), idPatient, idSampleNormal, idSampleTumor, file("${idSampleTumor}.*.{png,txt}") into ascatOut - - when: 'ascat' in tools - - script: - gender = genderMap[idPatient] - purity_ploidy = (params.ascat_purity && params.ascat_ploidy) ? 
"--purity ${params.ascat_purity} --ploidy ${params.ascat_ploidy}" : "" - """ - for f in *BAF *LogR; do sed 's/chr//g' \$f > tmpFile; mv tmpFile \$f;done - run_ascat.r \ - --tumorbaf ${bafTumor} \ - --tumorlogr ${logrTumor} \ - --normalbaf ${bafNormal} \ - --normallogr ${logrNormal} \ - --tumorname ${idSampleTumor} \ - --basedir ${baseDir} \ - --gcfile ${acLociGC} \ - --gender ${gender} \ - ${purity_ploidy} - """ -} +// script: +// """ +// msisensor msi -d ${msiSites} \ +// -b 4 \ +// -n ${bamNormal} \ +// -t ${bamTumor} \ +// -o ${idSampleTumor}_vs_${idSampleNormal}_msisensor +// """ +// } -ascatOut.dump(tag:'ASCAT') +// // STEP ASCAT.1 - ALLELECOUNTER -// STEP MPILEUP.1 +// // Run commands and code from Malin Larsson +// // Based on Jesper Eisfeldt's code +// process AlleleCounter { +// label 'memory_singleCPU_2_task' -process Mpileup { - label 'cpus_1' - label 'memory_singleCPU_2_task' +// tag "${idSample}" - tag "${idSample}-${intervalBed.baseName}" +// input: +// set idPatient, idSample, file(bam), file(bai) from bamAscat +// file(acLoci) from ch_ac_loci +// file(dict) from ch_dict +// file(fasta) from ch_fasta +// file(fastaFai) from ch_fai - publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { it == "${idSample}.pileup" ? "VariantCalling/${idSample}/Control-FREEC/${it}" : null } +// output: +// set idPatient, idSample, file("${idSample}.alleleCount") into alleleCounterOut - input: - set idPatient, idSample, file(bam), file(bai), file(intervalBed) from bamMpileup - file(fasta) from ch_fasta - file(fastaFai) from ch_fai +// when: 'ascat' in tools - output: - set idPatient, idSample, file("${prefix}${idSample}.pileup") into mpileupMerge - set idPatient, idSample into tsv_mpileup +// script: +// """ +// alleleCounter \ +// -l ${acLoci} \ +// -r ${fasta} \ +// -b ${bam} \ +// -o ${idSample}.alleleCount; +// """ +// } - when: 'controlfreec' in tools || 'mpileup' in tools +// alleleCountOutNormal = Channel.create() +// alleleCountOutTumor = Channel.create() - script: - prefix = params.no_intervals ? "" : "${intervalBed.baseName}_" - intervalsOptions = params.no_intervals ? "" : "-l ${intervalBed}" +// alleleCounterOut +// .choice(alleleCountOutTumor, alleleCountOutNormal) {statusMap[it[0], it[1]] == 0 ? 1 : 0} - """ - # Control-FREEC reads uncompresses the zipped file TWICE in single-threaded mode. 
- # we are therefore not using compressed pileups here - samtools mpileup \ - -f ${fasta} ${bam} \ - ${intervalsOptions} > ${prefix}${idSample}.pileup - """ -} +// alleleCounterOut = alleleCountOutNormal.combine(alleleCountOutTumor, by:0) -(tsv_mpileup, tsv_mpileup_sample) = tsv_mpileup.groupTuple(by:[0, 1]).into(2) - -// Creating a TSV file to restart from this step -tsv_mpileup.map { idPatient, idSample -> - gender = genderMap[idPatient] - status = statusMap[idPatient, idSample] - mpileup = "${params.outdir}/VariantCalling/${idSample}/Control-FREEC/${idSample}.pileup" - "${idPatient}\t${gender}\t${status}\t${idSample}\t${mpileup}\n" -}.collectFile( - name: 'control-freec_mpileup.tsv', sort: true, storeDir: "${params.outdir}/VariantCalling/TSV" -) - -tsv_mpileup_sample - .collectFile(storeDir: "${params.outdir}/VariantCalling/TSV") { - idPatient, idSample -> - status = statusMap[idPatient, idSample] - gender = genderMap[idPatient] - mpileup = "${params.outdir}/VariantCalling/${idSample}/Control-FREEC/${idSample}.pileup" - ["control-freec_mpileup_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${mpileup}\n"] -} +// alleleCounterOut = alleleCounterOut.map { +// idPatientNormal, idSampleNormal, alleleCountOutNormal, +// idSampleTumor, alleleCountOutTumor -> +// [idPatientNormal, idSampleNormal, idSampleTumor, alleleCountOutNormal, alleleCountOutTumor] +// } -if (!params.no_intervals) { - mpileupMerge = mpileupMerge.groupTuple(by:[0, 1]) - mpileupNoInt = Channel.empty() -} else { - (mpileupMerge, mpileupNoInt) = mpileupMerge.into(2) - mpileupMerge.close() -} +// // STEP ASCAT.2 - CONVERTALLELECOUNTS -// STEP MPILEUP.2 - MERGE -process MergeMpileup { - label 'cpus_1' +// // R script from Malin Larssons bitbucket repo: +// // https://bitbucket.org/malinlarsson/somatic_wgs_pipeline +// process ConvertAlleleCounts { +// label 'memory_singleCPU_2_task' - tag "${idSample}" - - publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { it == "${idSample}.pileup" ? "VariantCalling/${idSample}/Control-FREEC/${it}" : null } - - input: - set idPatient, idSample, file(mpileup) from mpileupMerge - - output: - set idPatient, idSample, file("${idSample}.pileup") into mpileupOut - - when: !(params.no_intervals) && 'controlfreec' in tools || 'mpileup' in tools - - script: - """ - for i in `ls -1v *.pileup`; - do cat \$i >> ${idSample}.pileup - done - """ -} - -mpileupOut = mpileupOut.mix(mpileupNoInt) -mpileupOut = mpileupOut.dump(tag:'mpileup') - -mpileupOutNormal = Channel.create() -mpileupOutTumor = Channel.create() - -if (step == 'controlfreec') mpileupOut = inputSample - -mpileupOut - .choice(mpileupOutTumor, mpileupOutNormal) {statusMap[it[0], it[1]] == 0 ? 
1 : 0} - -mpileupOut = mpileupOutNormal.combine(mpileupOutTumor, by:0) - -mpileupOut = mpileupOut.map { - idPatientNormal, idSampleNormal, mpileupOutNormal, - idSampleTumor, mpileupOutTumor -> - [idPatientNormal, idSampleNormal, idSampleTumor, mpileupOutNormal, mpileupOutTumor] -} - -// STEP CONTROLFREEC.1 - CONTROLFREEC - -process ControlFREEC { - label 'cpus_max' - //label 'memory_singleCPU_2_task' - - tag "${idSampleTumor}_vs_${idSampleNormal}" - - publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/Control-FREEC", mode: params.publish_dir_mode - - input: - set idPatient, idSampleNormal, idSampleTumor, file(mpileupNormal), file(mpileupTumor) from mpileupOut - file(chrDir) from ch_chr_dir - file(mappability) from ch_mappability - file(chrLength) from ch_chr_length - file(dbsnp) from ch_dbsnp - file(dbsnpIndex) from ch_dbsnp_tbi - file(fasta) from ch_fasta - file(fastaFai) from ch_fai - - output: - set idPatient, idSampleNormal, idSampleTumor, file("${idSampleTumor}.pileup_CNVs"), file("${idSampleTumor}.pileup_ratio.txt"), file("${idSampleTumor}.pileup_normal_CNVs"), file("${idSampleTumor}.pileup_normal_ratio.txt"), file("${idSampleTumor}.pileup_BAF.txt"), file("${idSampleNormal}.pileup_BAF.txt") into controlFreecViz - set file("*.pileup*"), file("${idSampleTumor}_vs_${idSampleNormal}.config.txt") into controlFreecOut - - when: 'controlfreec' in tools - - script: - config = "${idSampleTumor}_vs_${idSampleNormal}.config.txt" - gender = genderMap[idPatient] - // if we are using coefficientOfVariation, we must delete the window parameter - // it is "window = 20000" in the default settings, without coefficientOfVariation set, - // but we do not like it. Note, it is not written in stone - coeff_or_window = params.cf_window ? 
"window = ${params.cf_window}" : "coefficientOfVariation = ${params.cf_coeff}" - - """ - touch ${config} - echo "[general]" >> ${config} - echo "BedGraphOutput = TRUE" >> ${config} - echo "chrFiles = \${PWD}/${chrDir.fileName}" >> ${config} - echo "chrLenFile = \${PWD}/${chrLength.fileName}" >> ${config} - echo "gemMappabilityFile = \${PWD}/${mappability}" >> ${config} - echo "${coeff_or_window}" >> ${config} - echo "contaminationAdjustment = TRUE" >> ${config} - echo "forceGCcontentNormalization = 1" >> ${config} - echo "maxThreads = ${task.cpus}" >> ${config} - echo "minimalSubclonePresence = 20" >> ${config} - echo "ploidy = ${params.cf_ploidy}" >> ${config} - echo "sex = ${gender}" >> ${config} - echo "" >> ${config} - - echo "[control]" >> ${config} - echo "inputFormat = pileup" >> ${config} - echo "mateFile = \${PWD}/${mpileupNormal}" >> ${config} - echo "mateOrientation = FR" >> ${config} - echo "" >> ${config} - - echo "[sample]" >> ${config} - echo "inputFormat = pileup" >> ${config} - echo "mateFile = \${PWD}/${mpileupTumor}" >> ${config} - echo "mateOrientation = FR" >> ${config} - echo "" >> ${config} - - echo "[BAF]" >> ${config} - echo "SNPfile = ${dbsnp.fileName}" >> ${config} - - freec -conf ${config} - """ -} +// tag "${idSampleTumor}_vs_${idSampleNormal}" -controlFreecOut.dump(tag:'ControlFREEC') +// publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/ASCAT", mode: params.publish_dir_mode -// STEP CONTROLFREEC.3 - VISUALIZATION +// input: +// set idPatient, idSampleNormal, idSampleTumor, file(alleleCountNormal), file(alleleCountTumor) from alleleCounterOut -process ControlFreecViz { - label 'memory_singleCPU_2_task' +// output: +// set idPatient, idSampleNormal, idSampleTumor, file("${idSampleNormal}.BAF"), file("${idSampleNormal}.LogR"), file("${idSampleTumor}.BAF"), file("${idSampleTumor}.LogR") into convertAlleleCountsOut - tag "${idSampleTumor}_vs_${idSampleNormal}" +// when: 'ascat' in tools + +// script: +// gender = genderMap[idPatient] +// """ +// convertAlleleCounts.r ${idSampleTumor} ${alleleCountTumor} ${idSampleNormal} ${alleleCountNormal} ${gender} +// """ +// } + +// // STEP ASCAT.3 - ASCAT + +// // R scripts from Malin Larssons bitbucket repo: +// // https://bitbucket.org/malinlarsson/somatic_wgs_pipeline +// process Ascat { +// label 'memory_singleCPU_2_task' + +// tag "${idSampleTumor}_vs_${idSampleNormal}" + +// publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/ASCAT", mode: params.publish_dir_mode + +// input: +// set idPatient, idSampleNormal, idSampleTumor, file(bafNormal), file(logrNormal), file(bafTumor), file(logrTumor) from convertAlleleCountsOut +// file(acLociGC) from ch_ac_loci_gc + +// output: +// set val("ASCAT"), idPatient, idSampleNormal, idSampleTumor, file("${idSampleTumor}.*.{png,txt}") into ascatOut + +// when: 'ascat' in tools + +// script: +// gender = genderMap[idPatient] +// purity_ploidy = (params.ascat_purity && params.ascat_ploidy) ? 
"--purity ${params.ascat_purity} --ploidy ${params.ascat_ploidy}" : "" +// """ +// for f in *BAF *LogR; do sed 's/chr//g' \$f > tmpFile; mv tmpFile \$f;done +// run_ascat.r \ +// --tumorbaf ${bafTumor} \ +// --tumorlogr ${logrTumor} \ +// --normalbaf ${bafNormal} \ +// --normallogr ${logrNormal} \ +// --tumorname ${idSampleTumor} \ +// --basedir ${baseDir} \ +// --gcfile ${acLociGC} \ +// --gender ${gender} \ +// ${purity_ploidy} +// """ +// } - publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/Control-FREEC", mode: params.publish_dir_mode +// ascatOut.dump(tag:'ASCAT') - input: - set idPatient, idSampleNormal, idSampleTumor, file(cnvTumor), file(ratioTumor), file(cnvNormal), file(ratioNormal), file(bafTumor), file(bafNormal) from controlFreecViz +// // STEP MPILEUP.1 - output: - set file("*.txt"), file("*.png"), file("*.bed") into controlFreecVizOut +// process Mpileup { +// label 'cpus_1' +// label 'memory_singleCPU_2_task' - when: 'controlfreec' in tools +// tag "${idSample}-${intervalBed.baseName}" - """ - echo "Shaping CNV files to make sure we can assess significance" - awk 'NF==9{print}' ${cnvTumor} > TUMOR.CNVs - awk 'NF==7{print}' ${cnvNormal} > NORMAL.CNVs +// publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { it == "${idSample}.pileup" ? "VariantCalling/${idSample}/Control-FREEC/${it}" : null } - echo "############### Calculating significance values for TUMOR CNVs #############" - cat /opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/assess_significance.R | R --slave --args TUMOR.CNVs ${ratioTumor} +// input: +// set idPatient, idSample, file(bam), file(bai), file(intervalBed) from bamMpileup +// file(fasta) from ch_fasta +// file(fastaFai) from ch_fai - echo "############### Calculating significance values for NORMAL CNVs ############" - cat /opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/assess_significance.R | R --slave --args NORMAL.CNVs ${ratioNormal} +// output: +// set idPatient, idSample, file("${prefix}${idSample}.pileup") into mpileupMerge +// set idPatient, idSample into tsv_mpileup - echo "############### Creating graph for TUMOR ratios ###############" - cat /opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/makeGraph.R | R --slave --args 2 ${ratioTumor} ${bafTumor} +// when: 'controlfreec' in tools || 'mpileup' in tools - echo "############### Creating graph for NORMAL ratios ##############" - cat /opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/makeGraph.R | R --slave --args 2 ${ratioNormal} ${bafNormal} +// script: +// prefix = params.no_intervals ? "" : "${intervalBed.baseName}_" +// intervalsOptions = params.no_intervals ? "" : "-l ${intervalBed}" - echo "############### Creating BED files for TUMOR ##############" - perl /opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/freec2bed.pl -f ${ratioTumor} > ${idSampleTumor}.bed +// """ +// # Control-FREEC reads uncompresses the zipped file TWICE in single-threaded mode. 
+// # we are therefore not using compressed pileups here +// samtools mpileup \ +// -f ${fasta} ${bam} \ +// ${intervalsOptions} > ${prefix}${idSample}.pileup +// """ +// } - echo "############### Creating BED files for NORMAL #############" - perl /opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/freec2bed.pl -f ${ratioNormal} > ${idSampleNormal}.bed - """ -} - -controlFreecVizOut.dump(tag:'ControlFreecViz') - -// Remapping channels for QC and annotation - -(vcfStrelkaIndels, vcfStrelkaSNVS) = vcfStrelka.into(2) -(vcfStrelkaBPIndels, vcfStrelkaBPSNVS) = vcfStrelkaBP.into(2) -(vcfMantaSomaticSV, vcfMantaDiploidSV) = vcfManta.into(2) - -vcfKeep = Channel.empty().mix( - filteredMutect2Output.map{ - variantCaller, idPatient, idSample, vcf, tbi, tsv -> - [variantcaller, idSample, vcf] - }, - vcfConcatenated.map{ - variantcaller, idPatient, idSample, vcf, tbi -> - [variantcaller, idSample, vcf] - }, - vcf_sentieon_compressed.map { - variantcaller, idPatient, idSample, vcf, tbi -> - [variantcaller, idSample, vcf] - }, - vcfStrelkaSingle.map { - variantcaller, idPatient, idSample, vcf, tbi -> - [variantcaller, idSample, vcf[1]] - }, - vcfMantaSingle.map { - variantcaller, idPatient, idSample, vcf, tbi -> - [variantcaller, idSample, vcf[2]] - }, - vcfMantaDiploidSV.map { - variantcaller, idPatient, idSample, vcf, tbi -> - [variantcaller, idSample, vcf[2]] - }, - vcfMantaSomaticSV.map { - variantcaller, idPatient, idSample, vcf, tbi -> - [variantcaller, idSample, vcf[3]] - }, - vcfStrelkaIndels.map { - variantcaller, idPatient, idSample, vcf, tbi -> - [variantcaller, idSample, vcf[0]] - }, - vcfStrelkaSNVS.map { - variantcaller, idPatient, idSample, vcf, tbi -> - [variantcaller, idSample, vcf[1]] - }, - vcfStrelkaBPIndels.map { - variantcaller, idPatient, idSample, vcf, tbi -> - [variantcaller, idSample, vcf[0]] - }, - vcfStrelkaBPSNVS.map { - variantcaller, idPatient, idSample, vcf, tbi -> - [variantcaller, idSample, vcf[1]] - }, - vcfTIDDIT.map { - variantcaller, idPatient, idSample, vcf, tbi -> - [variantcaller, idSample, vcf] - }) - -(vcfBCFtools, vcfVCFtools, vcfAnnotation) = vcfKeep.into(3) - -// STEP VCF.QC - -process BcftoolsStats { - label 'cpus_1' - - tag "${variantCaller} - ${vcf}" - - publishDir "${params.outdir}/Reports/${idSample}/BCFToolsStats", mode: params.publish_dir_mode - - input: - set variantCaller, idSample, file(vcf) from vcfBCFtools - - output: - file ("*.bcf.tools.stats.out") into bcftoolsReport - - when: !('bcftools' in skipQC) - - script: - """ - bcftools stats ${vcf} > ${reduceVCF(vcf.fileName)}.bcf.tools.stats.out - """ -} - -bcftoolsReport = bcftoolsReport.dump(tag:'BCFTools') - -process Vcftools { - label 'cpus_1' - - tag "${variantCaller} - ${vcf}" - - publishDir "${params.outdir}/Reports/${idSample}/VCFTools", mode: params.publish_dir_mode - - input: - set variantCaller, idSample, file(vcf) from vcfVCFtools - - output: - file ("${reduceVCF(vcf.fileName)}.*") into vcftoolsReport - - when: !('vcftools' in skipQC) - - script: - """ - vcftools \ - --gzvcf ${vcf} \ - --TsTv-by-count \ - --out ${reduceVCF(vcf.fileName)} - - vcftools \ - --gzvcf ${vcf} \ - --TsTv-by-qual \ - --out ${reduceVCF(vcf.fileName)} - - vcftools \ - --gzvcf ${vcf} \ - --FILTER-summary \ - --out ${reduceVCF(vcf.fileName)} - """ -} - -vcftoolsReport = vcftoolsReport.dump(tag:'VCFTools') - -/* -================================================================================ - ANNOTATION -================================================================================ -*/ - -if (step == 
'annotate') { - vcfToAnnotate = Channel.create() - vcfNoAnnotate = Channel.create() - - if (tsvPath == []) { - // Sarek, by default, annotates all available vcfs that it can find in the VariantCalling directory - // Excluding vcfs from FreeBayes, and g.vcf from HaplotypeCaller - // Basically it's: results/VariantCalling/*/{HaplotypeCaller,Manta,Mutect2,SentieonDNAseq,SentieonDNAscope,SentieonTNscope,Strelka,TIDDIT}/*.vcf.gz - // Without *SmallIndels.vcf.gz from Manta, and *.genome.vcf.gz from Strelka - // The small snippet `vcf.minus(vcf.fileName)[-2]` catches idSample - // This field is used to output final annotated VCFs in the correct directory - Channel.empty().mix( - Channel.fromPath("${params.outdir}/VariantCalling/*/HaplotypeCaller/*.vcf.gz") - .flatten().map{vcf -> ['HaplotypeCaller', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, - Channel.fromPath("${params.outdir}/VariantCalling/*/Manta/*[!candidate]SV.vcf.gz") - .flatten().map{vcf -> ['Manta', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, - Channel.fromPath("${params.outdir}/VariantCalling/*/Mutect2/*.vcf.gz") - .flatten().map{vcf -> ['Mutect2', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, - Channel.fromPath("${params.outdir}/VariantCalling/*/SentieonDNAseq/*.vcf.gz") - .flatten().map{vcf -> ['SentieonDNAseq', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, - Channel.fromPath("${params.outdir}/VariantCalling/*/SentieonDNAscope/*.vcf.gz") - .flatten().map{vcf -> ['SentieonDNAscope', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, - Channel.fromPath("${params.outdir}/VariantCalling/*/SentieonTNscope/*.vcf.gz") - .flatten().map{vcf -> ['SentieonTNscope', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, - Channel.fromPath("${params.outdir}/VariantCalling/*/Strelka/*{somatic,variant}*.vcf.gz") - .flatten().map{vcf -> ['Strelka', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, - Channel.fromPath("${params.outdir}/VariantCalling/*/TIDDIT/*.vcf.gz") - .flatten().map{vcf -> ['TIDDIT', vcf.minus(vcf.fileName)[-2].toString(), vcf]} - ).choice(vcfToAnnotate, vcfNoAnnotate) { - annotateTools == [] || (annotateTools != [] && it[0] in annotateTools) ? 
0 : 1 - } - } else if (annotateTools == []) { - // Annotate user-submitted VCFs - // If user-submitted, Sarek assume that the idSample should be assumed automatically - vcfToAnnotate = Channel.fromPath(tsvPath) - .map{vcf -> ['userspecified', vcf.minus(vcf.fileName)[-2].toString(), vcf]} - } else exit 1, "specify only tools or files to annotate, not both" - - vcfNoAnnotate.close() - vcfAnnotation = vcfAnnotation.mix(vcfToAnnotate) -} - -// as now have the list of VCFs to annotate, the first step is to annotate with allele frequencies, if there are any - -(vcfSnpeff, vcfVep) = vcfAnnotation.into(2) - -vcfVep = vcfVep.map { - variantCaller, idSample, vcf -> - [variantCaller, idSample, vcf, null] -} - -// STEP SNPEFF - -process Snpeff { - tag "${idSample} - ${variantCaller} - ${vcf}" - - publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { - if (it == "${reducedVCF}_snpEff.ann.vcf") null - else "Reports/${idSample}/snpEff/${it}" - } +// (tsv_mpileup, tsv_mpileup_sample) = tsv_mpileup.groupTuple(by:[0, 1]).into(2) - input: - set variantCaller, idSample, file(vcf) from vcfSnpeff - file(dataDir) from ch_snpeff_cache - val snpeffDb from ch_snpeff_db - - output: - set file("${reducedVCF}_snpEff.genes.txt"), file("${reducedVCF}_snpEff.html"), file("${reducedVCF}_snpEff.csv") into snpeffReport - set variantCaller, idSample, file("${reducedVCF}_snpEff.ann.vcf") into snpeffVCF - - when: 'snpeff' in tools || 'merge' in tools - - script: - reducedVCF = reduceVCF(vcf.fileName) - cache = (params.snpeff_cache && params.annotation_cache) ? "-dataDir \${PWD}/${dataDir}" : "" - """ - snpEff -Xmx${task.memory.toGiga()}g \ - ${snpeffDb} \ - -csvStats ${reducedVCF}_snpEff.csv \ - -nodownload \ - ${cache} \ - -canon \ - -v \ - ${vcf} \ - > ${reducedVCF}_snpEff.ann.vcf - - mv snpEff_summary.html ${reducedVCF}_snpEff.html - """ -} - -snpeffReport = snpeffReport.dump(tag:'snpEff report') - -// STEP COMPRESS AND INDEX VCF.1 - SNPEFF - -process CompressVCFsnpEff { - tag "${idSample} - ${vcf}" - - publishDir "${params.outdir}/Annotation/${idSample}/snpEff", mode: params.publish_dir_mode - - input: - set variantCaller, idSample, file(vcf) from snpeffVCF - - output: - set variantCaller, idSample, file("*.vcf.gz"), file("*.vcf.gz.tbi") into (compressVCFsnpEffOut) - - script: - """ - bgzip < ${vcf} > ${vcf}.gz - tabix ${vcf}.gz - """ -} - -compressVCFsnpEffOut = compressVCFsnpEffOut.dump(tag:'VCF') - -// STEP VEP.1 - -process VEP { - label 'VEP' - label 'cpus_4' - - tag "${idSample} - ${variantCaller} - ${vcf}" - - publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { - if (it == "${reducedVCF}_VEP.summary.html") "Reports/${idSample}/VEP/${it}" - else null - } - - input: - set variantCaller, idSample, file(vcf), file(idx) from vcfVep - file(dataDir) from ch_vep_cache - val cache_version from ch_vep_cache_version - file(cadd_InDels) from ch_cadd_indels - file(cadd_InDels_tbi) from ch_cadd_indels_tbi - file(cadd_WG_SNVs) from ch_cadd_wg_snvs - file(cadd_WG_SNVs_tbi) from ch_cadd_wg_snvs_tbi - - output: - set variantCaller, idSample, file("${reducedVCF}_VEP.ann.vcf") into vepVCF - file("${reducedVCF}_VEP.summary.html") into vepReport - - when: 'vep' in tools - - script: - reducedVCF = reduceVCF(vcf.fileName) - genome = params.genome == 'smallGRCh37' ? 'GRCh37' : params.genome - - dir_cache = (params.vep_cache && params.annotation_cache) ? " \${PWD}/${dataDir}" : "/.vep" - cadd = (params.cadd_cache && params.cadd_wg_snvs && params.cadd_indels) ? 
"--plugin CADD,whole_genome_SNVs.tsv.gz,InDels.tsv.gz" : "" - genesplicer = params.genesplicer ? "--plugin GeneSplicer,/opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/genesplicer,/opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/share/genesplicer-1.0-1/human,context=200,tmpdir=\$PWD/${reducedVCF}" : "--offline" - """ - mkdir ${reducedVCF} - - vep \ - -i ${vcf} \ - -o ${reducedVCF}_VEP.ann.vcf \ - --assembly ${genome} \ - --species ${params.species} \ - ${cadd} \ - ${genesplicer} \ - --cache \ - --cache_version ${cache_version} \ - --dir_cache ${dir_cache} \ - --everything \ - --filter_common \ - --fork ${task.cpus} \ - --format vcf \ - --per_gene \ - --stats_file ${reducedVCF}_VEP.summary.html \ - --total_length \ - --vcf - - rm -rf ${reducedVCF} - """ -} - -vepReport = vepReport.dump(tag:'VEP') - -// STEP VEP.2 - VEP AFTER SNPEFF - -process VEPmerge { - label 'VEP' - label 'cpus_4' - - tag "${idSample} - ${variantCaller} - ${vcf}" - - publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { - if (it == "${reducedVCF}_VEP.summary.html") "Reports/${idSample}/VEP/${it}" - else null - } - - input: - set variantCaller, idSample, file(vcf), file(idx) from compressVCFsnpEffOut - file(dataDir) from ch_vep_cache - val cache_version from ch_vep_cache_version - file(cadd_InDels) from ch_cadd_indels - file(cadd_InDels_tbi) from ch_cadd_indels_tbi - file(cadd_WG_SNVs) from ch_cadd_wg_snvs - file(cadd_WG_SNVs_tbi) from ch_cadd_wg_snvs_tbi - - output: - set variantCaller, idSample, file("${reducedVCF}_VEP.ann.vcf") into vepVCFmerge - file("${reducedVCF}_VEP.summary.html") into vepReportMerge - - when: 'merge' in tools - - script: - reducedVCF = reduceVCF(vcf.fileName) - genome = params.genome == 'smallGRCh37' ? 'GRCh37' : params.genome - dir_cache = (params.vep_cache && params.annotation_cache) ? " \${PWD}/${dataDir}" : "/.vep" - cadd = (params.cadd_cache && params.cadd_wg_snvs && params.cadd_indels) ? "--plugin CADD,whole_genome_SNVs.tsv.gz,InDels.tsv.gz" : "" - genesplicer = params.genesplicer ? 
"--plugin GeneSplicer,/opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/genesplicer,/opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/share/genesplicer-1.0-1/human,context=200,tmpdir=\$PWD/${reducedVCF}" : "--offline" - """ - mkdir ${reducedVCF} - - vep \ - -i ${vcf} \ - -o ${reducedVCF}_VEP.ann.vcf \ - --assembly ${genome} \ - --species ${params.species} \ - ${cadd} \ - ${genesplicer} \ - --cache \ - --cache_version ${cache_version} \ - --dir_cache ${dir_cache} \ - --everything \ - --filter_common \ - --fork ${task.cpus} \ - --format vcf \ - --per_gene \ - --stats_file ${reducedVCF}_VEP.summary.html \ - --total_length \ - --vcf - - rm -rf ${reducedVCF} - """ -} - -vepReportMerge = vepReportMerge.dump(tag:'VEP') - -vcfCompressVCFvep = vepVCF.mix(vepVCFmerge) - -// STEP COMPRESS AND INDEX VCF.2 - VEP - -process CompressVCFvep { - tag "${idSample} - ${vcf}" - - publishDir "${params.outdir}/Annotation/${idSample}/VEP", mode: params.publish_dir_mode - - input: - set variantCaller, idSample, file(vcf) from vcfCompressVCFvep - - output: - set variantCaller, idSample, file("*.vcf.gz"), file("*.vcf.gz.tbi") into compressVCFOutVEP - - script: - """ - bgzip < ${vcf} > ${vcf}.gz - tabix ${vcf}.gz - """ -} - -compressVCFOutVEP = compressVCFOutVEP.dump(tag:'VCF') - -/* -================================================================================ - MultiQC -================================================================================ -*/ - -// STEP MULTIQC - -process MultiQC { - publishDir "${params.outdir}/Reports/MultiQC", mode: params.publish_dir_mode - - input: - file (multiqcConfig) from ch_multiqc_config - file (mqc_custom_config) from ch_multiqc_custom_config.collect().ifEmpty([]) - file (versions) from ch_software_versions_yaml.collect() - file workflow_summary from ch_workflow_summary.collectFile(name: "workflow_summary_mqc.yaml") - file ('bamQC/*') from bamQCReport.collect().ifEmpty([]) - file ('BCFToolsStats/*') from bcftoolsReport.collect().ifEmpty([]) - file ('FastQC/*') from fastQCReport.collect().ifEmpty([]) - file ('TrimmedFastQC/*') from trimGaloreReport.collect().ifEmpty([]) - file ('MarkDuplicates/*') from duplicates_marked_report.collect().ifEmpty([]) - file ('DuplicatesMarked/*.recal.table') from baseRecalibratorReport.collect().ifEmpty([]) - file ('SamToolsStats/*') from samtoolsStatsReport.collect().ifEmpty([]) - file ('snpEff/*') from snpeffReport.collect().ifEmpty([]) - file ('VCFTools/*') from vcftoolsReport.collect().ifEmpty([]) - - output: - file "*multiqc_report.html" into ch_multiqc_report - file "*_data" - file "multiqc_plots" - - when: !('multiqc' in skipQC) - - script: - rtitle = custom_runName ? "--title \"$custom_runName\"" : '' - rfilename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : '' - custom_config_file = params.multiqc_config ? "--config $mqc_custom_config" : '' - """ - multiqc -f ${rtitle} ${rfilename} ${custom_config_file} . 
- """ -} - -ch_multiqc_report.dump(tag:'MultiQC') - -// Output Description HTML -process Output_documentation { - publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode - - input: - file output_docs from ch_output_docs - - output: - file "results_description.html" - - when: !('documentation' in skipQC) - - script: - """ - markdown_to_html.py $output_docs -o results_description.html - """ -} - -// Completion e-mail notification -workflow.onComplete { - - // Set up the e-mail variables - def subject = "[nf-core/sarek] Successful: $workflow.runName" - if (!workflow.success) { - subject = "[nf-core/sarek] FAILED: $workflow.runName" - } - def email_fields = [:] - email_fields['version'] = workflow.manifest.version - email_fields['runName'] = custom_runName ?: workflow.runName - email_fields['success'] = workflow.success - email_fields['dateComplete'] = workflow.complete - email_fields['duration'] = workflow.duration - email_fields['exitStatus'] = workflow.exitStatus - email_fields['errorMessage'] = (workflow.errorMessage ?: 'None') - email_fields['errorReport'] = (workflow.errorReport ?: 'None') - email_fields['commandLine'] = workflow.commandLine - email_fields['projectDir'] = workflow.projectDir - email_fields['summary'] = summary - email_fields['summary']['Date Started'] = workflow.start - email_fields['summary']['Date Completed'] = workflow.complete - email_fields['summary']['Pipeline script file path'] = workflow.scriptFile - email_fields['summary']['Pipeline script hash ID'] = workflow.scriptId - if (workflow.repository) email_fields['summary']['Pipeline repository Git URL'] = workflow.repository - if (workflow.commitId) email_fields['summary']['Pipeline repository Git Commit'] = workflow.commitId - if (workflow.revision) email_fields['summary']['Pipeline Git branch/tag'] = workflow.revision - email_fields['summary']['Nextflow Version'] = workflow.nextflow.version - email_fields['summary']['Nextflow Build'] = workflow.nextflow.build - email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp - - def mqc_report = null - try { - if (workflow.success) { - mqc_report = ch_multiqc_report.getVal() - if (mqc_report.getClass() == ArrayList) { - log.warn "[nf-core/sarek] Found multiple reports from process 'multiqc', will use only one" - mqc_report = mqc_report[0] - } - } - } catch (all) { - log.warn "[nf-core/sarek] Could not attach MultiQC report to summary email" - } - - // Check if we are only sending emails on failure - email_address = params.email - if (!params.email && params.email_on_fail && !workflow.success) { - email_address = params.email_on_fail - } - - // Render the TXT template - def engine = new groovy.text.GStringTemplateEngine() - def tf = new File("$baseDir/assets/email_template.txt") - def txt_template = engine.createTemplate(tf).make(email_fields) - def email_txt = txt_template.toString() - - // Render the HTML template - def hf = new File("$baseDir/assets/email_template.html") - def html_template = engine.createTemplate(hf).make(email_fields) - def email_html = html_template.toString() - - // Render the sendmail template - def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$baseDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ] - def sf = new File("$baseDir/assets/sendmail_template.txt") - def sendmail_template = engine.createTemplate(sf).make(smail_fields) - def sendmail_html = sendmail_template.toString() - - // Send the HTML e-mail - 
if (email_address) { - try { - if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } - // Try to send HTML e-mail using sendmail - [ 'sendmail', '-t' ].execute() << sendmail_html - log.info "[nf-core/sarek] Sent summary e-mail to $email_address (sendmail)" - } catch (all) { - // Catch failures and try with plaintext - [ 'mail', '-s', subject, email_address ].execute() << email_txt - log.info "[nf-core/sarek] Sent summary e-mail to $email_address (mail)" - } - } - - // Write summary e-mail HTML to a file - def output_d = new File("${params.outdir}/pipeline_info/") - if (!output_d.exists()) { - output_d.mkdirs() - } - def output_hf = new File(output_d, "pipeline_report.html") - output_hf.withWriter { w -> w << email_html } - def output_tf = new File(output_d, "pipeline_report.txt") - output_tf.withWriter { w -> w << email_txt } - - c_green = params.monochrome_logs ? '' : "\033[0;32m"; - c_purple = params.monochrome_logs ? '' : "\033[0;35m"; - c_red = params.monochrome_logs ? '' : "\033[0;31m"; - c_reset = params.monochrome_logs ? '' : "\033[0m"; - - if (workflow.stats.ignoredCount > 0 && workflow.success) { - log.info "-${c_purple}Warning, pipeline completed, but with errored process(es) ${c_reset}-" - log.info "-${c_red}Number of ignored errored process(es) : ${workflow.stats.ignoredCount} ${c_reset}-" - log.info "-${c_green}Number of successfully ran process(es) : ${workflow.stats.succeedCount} ${c_reset}-" - } - - if (workflow.success) { - log.info "-${c_purple}[nf-core/sarek]${c_green} Pipeline completed successfully${c_reset}-" - } else { - checkHostname() - log.info "-${c_purple}[nf-core/sarek]${c_red} Pipeline completed with errors${c_reset}-" - } -} - -/* -================================================================================ - nf-core functions -================================================================================ -*/ - -def create_workflow_summary(summary) { - def yaml_file = workDir.resolve('workflow_summary_mqc.yaml') - yaml_file.text = """ - id: 'nf-core-sarek-summary' - description: " - this information is collected when the pipeline is started." - section_name: 'nf-core/sarek Workflow Summary' - section_href: 'https://github.com/nf-core/sarek' - plot_type: 'html' - data: | -
-        <dl class=\"dl-horizontal\">
-${summary.collect { k, v -> "            <dt>$k</dt><dd><samp>${v ?: '<span style=\"color:#999999;\">N/A</a>'}</samp></dd>" }.join("\n")}
-        </dl>
- """.stripIndent() - - return yaml_file -} - -def nfcoreHeader() { - // Log colors ANSI codes - c_black = params.monochrome_logs ? '' : "\033[0;30m"; - c_blue = params.monochrome_logs ? '' : "\033[0;34m"; - c_dim = params.monochrome_logs ? '' : "\033[2m"; - c_green = params.monochrome_logs ? '' : "\033[0;32m"; - c_purple = params.monochrome_logs ? '' : "\033[0;35m"; - c_reset = params.monochrome_logs ? '' : "\033[0m"; - c_white = params.monochrome_logs ? '' : "\033[0;37m"; - c_yellow = params.monochrome_logs ? '' : "\033[0;33m"; - - return """ -${c_dim}--------------------------------------------------${c_reset}- - ${c_green},--.${c_black}/${c_green},-.${c_reset} - ${c_blue} ___ __ __ __ ___ ${c_green}/,-._.--~\'${c_reset} - ${c_blue} |\\ | |__ __ / ` / \\ |__) |__ ${c_yellow}} {${c_reset} - ${c_blue} | \\| | \\__, \\__/ | \\ |___ ${c_green}\\`-._,-`-,${c_reset} - ${c_green}`._,._,\'${c_reset} - ${c_white}____${c_reset} - ${c_white}.´ _ `.${c_reset} - ${c_white}/ ${c_green}|\\${c_reset}`-_ \\${c_reset} ${c_blue} __ __ ___ ${c_reset} - ${c_white}| ${c_green}| \\${c_reset} `-|${c_reset} ${c_blue}|__` /\\ |__) |__ |__/${c_reset} - ${c_white}\\ ${c_green}| \\${c_reset} /${c_reset} ${c_blue}.__| /¯¯\\ | \\ |___ | \\${c_reset} - ${c_white}`${c_green}|${c_reset}____${c_green}\\${c_reset}´${c_reset} - - ${c_purple} nf-core/sarek v${workflow.manifest.version}${c_reset} - -${c_dim}--------------------------------------------------${c_reset}- - """.stripIndent() -} - -def checkHostname() { - def c_reset = params.monochrome_logs ? '' : "\033[0m" - def c_white = params.monochrome_logs ? '' : "\033[0;37m" - def c_red = params.monochrome_logs ? '' : "\033[1;91m" - def c_yellow_bold = params.monochrome_logs ? '' : "\033[1;93m" - if (params.hostnames) { - def hostname = "hostname".execute().text.trim() - params.hostnames.each { prof, hnames -> - hnames.each { hname -> - if (hostname.contains(hname) && !workflow.profile.contains(prof)) { - log.error "====================================================\n" + - " ${c_red}WARNING!${c_reset} You are running with `-profile $workflow.profile`\n" + - " but your machine hostname is ${c_white}'$hostname'${c_reset}\n" + - " ${c_yellow_bold}It's highly recommended that you use `-profile $prof${c_reset}`\n" + - "============================================================" - } - } - } - } -} - -/* -================================================================================ - sarek functions -================================================================================ -*/ - -// Check if a row has the expected number of item -def checkNumberOfItem(row, number) { - if (row.size() != number) exit 1, "Malformed row in TSV file: ${row}, see --help for more information" - return true -} - -// Check parameter existence -def checkParameterExistence(it, list) { - if (!list.contains(it)) { - log.warn "Unknown parameter: ${it}" - return false - } - return true -} - -// Compare each parameter with a list of parameters -def checkParameterList(list, realList) { - return list.every{ checkParameterExistence(it, realList) } -} - -// Define list of available tools to annotate -def defineAnnoList() { - return [ - 'haplotypecaller', - 'manta', - 'mutect2', - 'strelka', - 'tiddit' - ] -} - -// Define list of skipable QC tools -def defineSkipQClist() { - return [ - 'bamqc', - 'baserecalibrator', - 'bcftools', - 'documentation', - 'fastqc', - 'markduplicates', - 'multiqc', - 'samtools', - 'sentieon', - 'vcftools', - 'versions' - ] -} - -// Define list of available step -def 
defineStepList() { - return [ - 'annotate', - 'controlfreec', - 'mapping', - 'preparerecalibration', - 'recalibrate', - 'variantcalling' - ] -} - -// Define list of available tools -def defineToolList() { - return [ - 'ascat', - 'cnvkit', - 'controlfreec', - 'dnascope', - 'dnaseq', - 'freebayes', - 'haplotypecaller', - 'manta', - 'merge', - 'mpileup', - 'mutect2', - 'snpeff', - 'strelka', - 'tiddit', - 'tnscope', - 'vep', - 'msisensor' - ] -} - -// Channeling the TSV file containing BAM. -// Format is: "subject gender status sample bam bai" -def extractBam(tsvFile) { - Channel.from(tsvFile) - .splitCsv(sep: '\t') - .map { row -> - checkNumberOfItem(row, 6) - def idPatient = row[0] - def gender = row[1] - def status = returnStatus(row[2].toInteger()) - def idSample = row[3] - def bamFile = returnFile(row[4]) - def baiFile = returnFile(row[5]) - - if (!hasExtension(bamFile, "bam")) exit 1, "File: ${bamFile} has the wrong extension. See --help for more information" - if (!hasExtension(baiFile, "bai")) exit 1, "File: ${baiFile} has the wrong extension. See --help for more information" - - return [idPatient, gender, status, idSample, bamFile, baiFile] - } -} - -// Create a channel of germline FASTQs from a directory pattern: "my_samples/*/" -// All FASTQ files in subdirectories are collected and emitted; -// they must have _R1_ and _R2_ in their names. -def extractFastqFromDir(pattern) { - def fastq = Channel.create() - // a temporary channel does all the work - Channel - .fromPath(pattern, type: 'dir') - .ifEmpty { error "No directories found matching pattern '${pattern}'" } - .subscribe onNext: { sampleDir -> - // the last name of the sampleDir is assumed to be a unique sample id - sampleId = sampleDir.getFileName().toString() - - for (path1 in file("${sampleDir}/**_R1_*.fastq.gz")) { - assert path1.getName().contains('_R1_') - path2 = file(path1.toString().replace('_R1_', '_R2_')) - if (!path2.exists()) error "Path '${path2}' not found" - (flowcell, lane) = flowcellLaneFromFastq(path1) - patient = sampleId - gender = 'ZZ' // unused - status = 0 // normal (not tumor) - rgId = "${flowcell}.${sampleId}.${lane}" - result = [patient, gender, status, sampleId, rgId, path1, path2] - fastq.bind(result) - } - }, onComplete: { fastq.close() } - fastq -} - -// Extract gender and status from Channel -def extractInfos(channel) { - def genderMap = [:] - def statusMap = [:] - channel = channel.map{ it -> - def idPatient = it[0] - def gender = it[1] - def status = it[2] - def idSample = it[3] - genderMap[idPatient] = gender - statusMap[idPatient, idSample] = status - [idPatient] + it[3..-1] - } - [genderMap, statusMap, channel] -} - -// Channeling the TSV file containing FASTQ or BAM -// Format is: "subject gender status sample lane fastq1 fastq2" -// or: "subject gender status sample lane bam" -def extractFastq(tsvFile) { - Channel.from(tsvFile) - .splitCsv(sep: '\t') - .map { row -> - def idPatient = row[0] - def gender = row[1] - def status = returnStatus(row[2].toInteger()) - def idSample = row[3] - def idRun = row[4] - def file1 = returnFile(row[5]) - def file2 = "null" - if (hasExtension(file1, "fastq.gz") || hasExtension(file1, "fq.gz") || hasExtension(file1, "fastq") || hasExtension(file1, "fq")) { - checkNumberOfItem(row, 7) - file2 = returnFile(row[6]) - if (!hasExtension(file2, "fastq.gz") && !hasExtension(file2, "fq.gz") && !hasExtension(file2, "fastq") && !hasExtension(file2, "fq")) exit 1, "File: ${file2} has the wrong extension. 
See --help for more information" - if (hasExtension(file1, "fastq") || hasExtension(file1, "fq") || hasExtension(file2, "fastq") || hasExtension(file2, "fq")) { - exit 1, "We do recommend to use gziped fastq file to help you reduce your data footprint." - } - } - else if (hasExtension(file1, "bam")) checkNumberOfItem(row, 6) - else "No recognisable extention for input file: ${file1}" - - [idPatient, gender, status, idSample, idRun, file1, file2] - } -} - -// Channeling the TSV file containing mpileup -// Format is: "subject gender status sample pileup" -def extractPileup(tsvFile) { - Channel.from(tsvFile) - .splitCsv(sep: '\t') - .map { row -> - checkNumberOfItem(row, 5) - def idPatient = row[0] - def gender = row[1] - def status = returnStatus(row[2].toInteger()) - def idSample = row[3] - def mpileup = returnFile(row[4]) - - if (!hasExtension(mpileup, "pileup")) exit 1, "File: ${mpileup} has the wrong extension. See --help for more information" - - return [idPatient, gender, status, idSample, mpileup] - } -} - -// Channeling the TSV file containing Recalibration Tables. -// Format is: "subject gender status sample bam bai recalTable" -def extractRecal(tsvFile) { - Channel.from(tsvFile) - .splitCsv(sep: '\t') - .map { row -> - checkNumberOfItem(row, 7) - def idPatient = row[0] - def gender = row[1] - def status = returnStatus(row[2].toInteger()) - def idSample = row[3] - def bamFile = returnFile(row[4]) - def baiFile = returnFile(row[5]) - def recalTable = returnFile(row[6]) - - if (!hasExtension(bamFile, "bam")) exit 1, "File: ${bamFile} has the wrong extension. See --help for more information" - if (!hasExtension(baiFile, "bai")) exit 1, "File: ${baiFile} has the wrong extension. See --help for more information" - if (!hasExtension(recalTable, "recal.table")) exit 1, "File: ${recalTable} has the wrong extension. See --help for more information" - - [idPatient, gender, status, idSample, bamFile, baiFile, recalTable] - } -} - -// Parse first line of a FASTQ file, return the flowcell id and lane number. -def flowcellLaneFromFastq(path) { - // expected format: - // xx:yy:FLOWCELLID:LANE:... (seven fields) - // or - // FLOWCELLID:LANE:xx:... 
(five fields) - InputStream fileStream = new FileInputStream(path.toFile()) - InputStream gzipStream = new java.util.zip.GZIPInputStream(fileStream) - Reader decoder = new InputStreamReader(gzipStream, 'ASCII') - BufferedReader buffered = new BufferedReader(decoder) - def line = buffered.readLine() - assert line.startsWith('@') - line = line.substring(1) - def fields = line.split(' ')[0].split(':') - String fcid - int lane - if (fields.size() == 7) { - // CASAVA 1.8+ format - fcid = fields[2] - lane = fields[3].toInteger() - } else if (fields.size() == 5) { - fcid = fields[0] - lane = fields[1].toInteger() - } - [fcid, lane] -} - -// Check file extension -def hasExtension(it, extension) { - it.toString().toLowerCase().endsWith(extension.toLowerCase()) -} - -// Return file if it exists -def returnFile(it) { - if (!file(it).exists()) exit 1, "Missing file in TSV file: ${it}, see --help for more information" - return file(it) -} - -// Remove .ann .gz and .vcf extension from a VCF file -def reduceVCF(file) { - return file.fileName.toString().minus(".ann").minus(".vcf").minus(".gz") -} - -// Return status [0,1] -// 0 == Normal, 1 == Tumor -def returnStatus(it) { - if (!(it in [0, 1])) exit 1, "Status is not recognized in TSV file: ${it}, see --help for more information" - return it -} +// // Creating a TSV file to restart from this step +// tsv_mpileup.map { idPatient, idSample -> +// gender = genderMap[idPatient] +// status = statusMap[idPatient, idSample] +// mpileup = "${params.outdir}/VariantCalling/${idSample}/Control-FREEC/${idSample}.pileup" +// "${idPatient}\t${gender}\t${status}\t${idSample}\t${mpileup}\n" +// }.collectFile( +// name: 'control-freec_mpileup.tsv', sort: true, storeDir: "${params.outdir}/VariantCalling/TSV" +// ) + +// tsv_mpileup_sample +// .collectFile(storeDir: "${params.outdir}/VariantCalling/TSV") { +// idPatient, idSample -> +// status = statusMap[idPatient, idSample] +// gender = genderMap[idPatient] +// mpileup = "${params.outdir}/VariantCalling/${idSample}/Control-FREEC/${idSample}.pileup" +// ["control-freec_mpileup_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${mpileup}\n"] +// } + +// if (!params.no_intervals) { +// mpileupMerge = mpileupMerge.groupTuple(by:[0, 1]) +// mpileupNoInt = Channel.empty() +// } else { +// (mpileupMerge, mpileupNoInt) = mpileupMerge.into(2) +// mpileupMerge.close() +// } + +// // STEP MPILEUP.2 - MERGE +// process MergeMpileup { +// label 'cpus_1' + +// tag "${idSample}" + +// publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { it == "${idSample}.pileup" ? "VariantCalling/${idSample}/Control-FREEC/${it}" : null } + +// input: +// set idPatient, idSample, file(mpileup) from mpileupMerge + +// output: +// set idPatient, idSample, file("${idSample}.pileup") into mpileupOut + +// when: !(params.no_intervals) && 'controlfreec' in tools || 'mpileup' in tools + +// script: +// """ +// for i in `ls -1v *.pileup`; +// do cat \$i >> ${idSample}.pileup +// done +// """ +// } + +// mpileupOut = mpileupOut.mix(mpileupNoInt) +// mpileupOut = mpileupOut.dump(tag:'mpileup') + +// mpileupOutNormal = Channel.create() +// mpileupOutTumor = Channel.create() + +// if (step == 'controlfreec') mpileupOut = inputSample + +// mpileupOut +// .choice(mpileupOutTumor, mpileupOutNormal) {statusMap[it[0], it[1]] == 0 ? 
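+//     // Routing note (added sketch, not part of the original source): `choice` sends
+//     // each tuple to the target channel whose index the closure returns, so status 0
+//     // (normal) yields index 1 (mpileupOutNormal) and status 1 (tumor) yields
+//     // index 0 (mpileupOutTumor).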
1 : 0} + +// mpileupOut = mpileupOutNormal.combine(mpileupOutTumor, by:0) + +// mpileupOut = mpileupOut.map { +// idPatientNormal, idSampleNormal, mpileupOutNormal, +// idSampleTumor, mpileupOutTumor -> +// [idPatientNormal, idSampleNormal, idSampleTumor, mpileupOutNormal, mpileupOutTumor] +// } + +// // STEP CONTROLFREEC.1 - CONTROLFREEC + +// process ControlFREEC { +// label 'cpus_max' +// //label 'memory_singleCPU_2_task' + +// tag "${idSampleTumor}_vs_${idSampleNormal}" + +// publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/Control-FREEC", mode: params.publish_dir_mode + +// input: +// set idPatient, idSampleNormal, idSampleTumor, file(mpileupNormal), file(mpileupTumor) from mpileupOut +// file(chrDir) from ch_chr_dir +// file(mappability) from ch_mappability +// file(chrLength) from ch_chr_length +// file(dbsnp) from ch_dbsnp +// file(dbsnpIndex) from ch_dbsnp_tbi +// file(fasta) from ch_fasta +// file(fastaFai) from ch_fai + +// output: +// set idPatient, idSampleNormal, idSampleTumor, file("${idSampleTumor}.pileup_CNVs"), file("${idSampleTumor}.pileup_ratio.txt"), file("${idSampleTumor}.pileup_normal_CNVs"), file("${idSampleTumor}.pileup_normal_ratio.txt"), file("${idSampleTumor}.pileup_BAF.txt"), file("${idSampleNormal}.pileup_BAF.txt") into controlFreecViz +// set file("*.pileup*"), file("${idSampleTumor}_vs_${idSampleNormal}.config.txt") into controlFreecOut + +// when: 'controlfreec' in tools + +// script: +// config = "${idSampleTumor}_vs_${idSampleNormal}.config.txt" +// gender = genderMap[idPatient] +// // if we are using coefficientOfVariation, we must delete the window parameter +// // it is "window = 20000" in the default settings, without coefficientOfVariation set, +// // but we do not like it. Note, it is not written in stone +// coeff_or_window = params.cf_window ? 
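+//     // Illustration (flag values are hypothetical): running with `--cf_window 50000`
+//     // puts `window = 50000` into the generated [general] section, while omitting it
+//     // puts `coefficientOfVariation = <params.cf_coeff>` there instead; per the
+//     // comment above, Control-FREEC should get one of the two settings, not both.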
"window = ${params.cf_window}" : "coefficientOfVariation = ${params.cf_coeff}" + +// """ +// touch ${config} +// echo "[general]" >> ${config} +// echo "BedGraphOutput = TRUE" >> ${config} +// echo "chrFiles = \${PWD}/${chrDir.fileName}" >> ${config} +// echo "chrLenFile = \${PWD}/${chrLength.fileName}" >> ${config} +// echo "gemMappabilityFile = \${PWD}/${mappability}" >> ${config} +// echo "${coeff_or_window}" >> ${config} +// echo "contaminationAdjustment = TRUE" >> ${config} +// echo "forceGCcontentNormalization = 1" >> ${config} +// echo "maxThreads = ${task.cpus}" >> ${config} +// echo "minimalSubclonePresence = 20" >> ${config} +// echo "ploidy = ${params.cf_ploidy}" >> ${config} +// echo "sex = ${gender}" >> ${config} +// echo "" >> ${config} + +// echo "[control]" >> ${config} +// echo "inputFormat = pileup" >> ${config} +// echo "mateFile = \${PWD}/${mpileupNormal}" >> ${config} +// echo "mateOrientation = FR" >> ${config} +// echo "" >> ${config} + +// echo "[sample]" >> ${config} +// echo "inputFormat = pileup" >> ${config} +// echo "mateFile = \${PWD}/${mpileupTumor}" >> ${config} +// echo "mateOrientation = FR" >> ${config} +// echo "" >> ${config} + +// echo "[BAF]" >> ${config} +// echo "SNPfile = ${dbsnp.fileName}" >> ${config} + +// freec -conf ${config} +// """ +// } + +// controlFreecOut.dump(tag:'ControlFREEC') + +// // STEP CONTROLFREEC.3 - VISUALIZATION + +// process ControlFreecViz { +// label 'memory_singleCPU_2_task' + +// tag "${idSampleTumor}_vs_${idSampleNormal}" + +// publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/Control-FREEC", mode: params.publish_dir_mode + +// input: +// set idPatient, idSampleNormal, idSampleTumor, file(cnvTumor), file(ratioTumor), file(cnvNormal), file(ratioNormal), file(bafTumor), file(bafNormal) from controlFreecViz + +// output: +// set file("*.txt"), file("*.png"), file("*.bed") into controlFreecVizOut + +// when: 'controlfreec' in tools + +// """ +// echo "Shaping CNV files to make sure we can assess significance" +// awk 'NF==9{print}' ${cnvTumor} > TUMOR.CNVs +// awk 'NF==7{print}' ${cnvNormal} > NORMAL.CNVs + +// echo "############### Calculating significance values for TUMOR CNVs #############" +// cat /opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/assess_significance.R | R --slave --args TUMOR.CNVs ${ratioTumor} + +// echo "############### Calculating significance values for NORMAL CNVs ############" +// cat /opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/assess_significance.R | R --slave --args NORMAL.CNVs ${ratioNormal} + +// echo "############### Creating graph for TUMOR ratios ###############" +// cat /opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/makeGraph.R | R --slave --args 2 ${ratioTumor} ${bafTumor} + +// echo "############### Creating graph for NORMAL ratios ##############" +// cat /opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/makeGraph.R | R --slave --args 2 ${ratioNormal} ${bafNormal} + +// echo "############### Creating BED files for TUMOR ##############" +// perl /opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/freec2bed.pl -f ${ratioTumor} > ${idSampleTumor}.bed + +// echo "############### Creating BED files for NORMAL #############" +// perl /opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/freec2bed.pl -f ${ratioNormal} > ${idSampleNormal}.bed +// """ +// } + +// controlFreecVizOut.dump(tag:'ControlFreecViz') + +// // Remapping channels for QC and annotation + +// (vcfStrelkaIndels, 
vcfStrelkaSNVS) = vcfStrelka.into(2) +// (vcfStrelkaBPIndels, vcfStrelkaBPSNVS) = vcfStrelkaBP.into(2) +// (vcfMantaSomaticSV, vcfMantaDiploidSV) = vcfManta.into(2) + +// vcfKeep = Channel.empty().mix( +// filteredMutect2Output.map{ +// variantCaller, idPatient, idSample, vcf, tbi, tsv -> +// [variantcaller, idSample, vcf] +// }, +// vcfConcatenated.map{ +// variantcaller, idPatient, idSample, vcf, tbi -> +// [variantcaller, idSample, vcf] +// }, +// vcf_sentieon_compressed.map { +// variantcaller, idPatient, idSample, vcf, tbi -> +// [variantcaller, idSample, vcf] +// }, +// vcfStrelkaSingle.map { +// variantcaller, idPatient, idSample, vcf, tbi -> +// [variantcaller, idSample, vcf[1]] +// }, +// vcfMantaSingle.map { +// variantcaller, idPatient, idSample, vcf, tbi -> +// [variantcaller, idSample, vcf[2]] +// }, +// vcfMantaDiploidSV.map { +// variantcaller, idPatient, idSample, vcf, tbi -> +// [variantcaller, idSample, vcf[2]] +// }, +// vcfMantaSomaticSV.map { +// variantcaller, idPatient, idSample, vcf, tbi -> +// [variantcaller, idSample, vcf[3]] +// }, +// vcfStrelkaIndels.map { +// variantcaller, idPatient, idSample, vcf, tbi -> +// [variantcaller, idSample, vcf[0]] +// }, +// vcfStrelkaSNVS.map { +// variantcaller, idPatient, idSample, vcf, tbi -> +// [variantcaller, idSample, vcf[1]] +// }, +// vcfStrelkaBPIndels.map { +// variantcaller, idPatient, idSample, vcf, tbi -> +// [variantcaller, idSample, vcf[0]] +// }, +// vcfStrelkaBPSNVS.map { +// variantcaller, idPatient, idSample, vcf, tbi -> +// [variantcaller, idSample, vcf[1]] +// }, +// vcfTIDDIT.map { +// variantcaller, idPatient, idSample, vcf, tbi -> +// [variantcaller, idSample, vcf] +// }) + +// (vcfBCFtools, vcfVCFtools, vcfAnnotation) = vcfKeep.into(3) + +// // STEP VCF.QC + +// process BcftoolsStats { +// label 'cpus_1' + +// tag "${variantCaller} - ${vcf}" + +// publishDir "${params.outdir}/Reports/${idSample}/BCFToolsStats", mode: params.publish_dir_mode + +// input: +// set variantCaller, idSample, file(vcf) from vcfBCFtools + +// output: +// file ("*.bcf.tools.stats.out") into bcftoolsReport + +// when: !('bcftools' in skipQC) + +// script: +// """ +// bcftools stats ${vcf} > ${reduceVCF(vcf.fileName)}.bcf.tools.stats.out +// """ +// } + +// bcftoolsReport = bcftoolsReport.dump(tag:'BCFTools') + +// process Vcftools { +// label 'cpus_1' + +// tag "${variantCaller} - ${vcf}" + +// publishDir "${params.outdir}/Reports/${idSample}/VCFTools", mode: params.publish_dir_mode + +// input: +// set variantCaller, idSample, file(vcf) from vcfVCFtools + +// output: +// file ("${reduceVCF(vcf.fileName)}.*") into vcftoolsReport + +// when: !('vcftools' in skipQC) + +// script: +// """ +// vcftools \ +// --gzvcf ${vcf} \ +// --TsTv-by-count \ +// --out ${reduceVCF(vcf.fileName)} + +// vcftools \ +// --gzvcf ${vcf} \ +// --TsTv-by-qual \ +// --out ${reduceVCF(vcf.fileName)} + +// vcftools \ +// --gzvcf ${vcf} \ +// --FILTER-summary \ +// --out ${reduceVCF(vcf.fileName)} +// """ +// } + +// vcftoolsReport = vcftoolsReport.dump(tag:'VCFTools') + +// /* +// ================================================================================ +// ANNOTATION +// ================================================================================ +// */ + +// if (step == 'annotate') { +// vcfToAnnotate = Channel.create() +// vcfNoAnnotate = Channel.create() + +// if (tsvPath == []) { +// // Sarek, by default, annotates all available vcfs that it can find in the VariantCalling directory +// // Excluding vcfs from FreeBayes, and g.vcf from 
HaplotypeCaller +// // Basically it's: results/VariantCalling/*/{HaplotypeCaller,Manta,Mutect2,SentieonDNAseq,SentieonDNAscope,SentieonTNscope,Strelka,TIDDIT}/*.vcf.gz +// // Without *SmallIndels.vcf.gz from Manta, and *.genome.vcf.gz from Strelka +// // The small snippet `vcf.minus(vcf.fileName)[-2]` catches idSample +// // This field is used to output final annotated VCFs in the correct directory +// Channel.empty().mix( +// Channel.fromPath("${params.outdir}/VariantCalling/*/HaplotypeCaller/*.vcf.gz") +// .flatten().map{vcf -> ['HaplotypeCaller', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, +// Channel.fromPath("${params.outdir}/VariantCalling/*/Manta/*[!candidate]SV.vcf.gz") +// .flatten().map{vcf -> ['Manta', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, +// Channel.fromPath("${params.outdir}/VariantCalling/*/Mutect2/*.vcf.gz") +// .flatten().map{vcf -> ['Mutect2', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, +// Channel.fromPath("${params.outdir}/VariantCalling/*/SentieonDNAseq/*.vcf.gz") +// .flatten().map{vcf -> ['SentieonDNAseq', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, +// Channel.fromPath("${params.outdir}/VariantCalling/*/SentieonDNAscope/*.vcf.gz") +// .flatten().map{vcf -> ['SentieonDNAscope', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, +// Channel.fromPath("${params.outdir}/VariantCalling/*/SentieonTNscope/*.vcf.gz") +// .flatten().map{vcf -> ['SentieonTNscope', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, +// Channel.fromPath("${params.outdir}/VariantCalling/*/Strelka/*{somatic,variant}*.vcf.gz") +// .flatten().map{vcf -> ['Strelka', vcf.minus(vcf.fileName)[-2].toString(), vcf]}, +// Channel.fromPath("${params.outdir}/VariantCalling/*/TIDDIT/*.vcf.gz") +// .flatten().map{vcf -> ['TIDDIT', vcf.minus(vcf.fileName)[-2].toString(), vcf]} +// ).choice(vcfToAnnotate, vcfNoAnnotate) { +// annotateTools == [] || (annotateTools != [] && it[0] in annotateTools) ? 0 : 1 +// } +// } else if (annotateTools == []) { +// // Annotate user-submitted VCFs +// // If user-submitted, Sarek assume that the idSample should be assumed automatically +// vcfToAnnotate = Channel.fromPath(tsvPath) +// .map{vcf -> ['userspecified', vcf.minus(vcf.fileName)[-2].toString(), vcf]} +// } else exit 1, "specify only tools or files to annotate, not both" + +// vcfNoAnnotate.close() +// vcfAnnotation = vcfAnnotation.mix(vcfToAnnotate) +// } + +// // as now have the list of VCFs to annotate, the first step is to annotate with allele frequencies, if there are any + +// (vcfSnpeff, vcfVep) = vcfAnnotation.into(2) + +// vcfVep = vcfVep.map { +// variantCaller, idSample, vcf -> +// [variantCaller, idSample, vcf, null] +// } + +// // STEP SNPEFF + +// process Snpeff { +// tag "${idSample} - ${variantCaller} - ${vcf}" + +// publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { +// if (it == "${reducedVCF}_snpEff.ann.vcf") null +// else "Reports/${idSample}/snpEff/${it}" +// } + +// input: +// set variantCaller, idSample, file(vcf) from vcfSnpeff +// file(dataDir) from ch_snpeff_cache +// val snpeffDb from ch_snpeff_db + +// output: +// set file("${reducedVCF}_snpEff.genes.txt"), file("${reducedVCF}_snpEff.html"), file("${reducedVCF}_snpEff.csv") into snpeffReport +// set variantCaller, idSample, file("${reducedVCF}_snpEff.ann.vcf") into snpeffVCF + +// when: 'snpeff' in tools || 'merge' in tools + +// script: +// reducedVCF = reduceVCF(vcf.fileName) +// cache = (params.snpeff_cache && params.annotation_cache) ? 
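+//     // Sketch of the two branches (assuming `dataDir` stages a local snpEff cache):
+//     // with `--snpeff_cache` plus `--annotation_cache` the command line gains
+//     // `-dataDir $PWD/<cache>`; otherwise no flag is added and snpEff presumably
+//     // uses the data directory shipped in the container, since downloads are
+//     // disabled by `-nodownload` below.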
"-dataDir \${PWD}/${dataDir}" : "" +// """ +// snpEff -Xmx${task.memory.toGiga()}g \ +// ${snpeffDb} \ +// -csvStats ${reducedVCF}_snpEff.csv \ +// -nodownload \ +// ${cache} \ +// -canon \ +// -v \ +// ${vcf} \ +// > ${reducedVCF}_snpEff.ann.vcf + +// mv snpEff_summary.html ${reducedVCF}_snpEff.html +// """ +// } + +// snpeffReport = snpeffReport.dump(tag:'snpEff report') + +// // STEP COMPRESS AND INDEX VCF.1 - SNPEFF + +// process CompressVCFsnpEff { +// tag "${idSample} - ${vcf}" + +// publishDir "${params.outdir}/Annotation/${idSample}/snpEff", mode: params.publish_dir_mode + +// input: +// set variantCaller, idSample, file(vcf) from snpeffVCF + +// output: +// set variantCaller, idSample, file("*.vcf.gz"), file("*.vcf.gz.tbi") into (compressVCFsnpEffOut) + +// script: +// """ +// bgzip < ${vcf} > ${vcf}.gz +// tabix ${vcf}.gz +// """ +// } + +// compressVCFsnpEffOut = compressVCFsnpEffOut.dump(tag:'VCF') + +// // STEP VEP.1 + +// process VEP { +// label 'VEP' +// label 'cpus_4' + +// tag "${idSample} - ${variantCaller} - ${vcf}" + +// publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { +// if (it == "${reducedVCF}_VEP.summary.html") "Reports/${idSample}/VEP/${it}" +// else null +// } + +// input: +// set variantCaller, idSample, file(vcf), file(idx) from vcfVep +// file(dataDir) from ch_vep_cache +// val cache_version from ch_vep_cache_version +// file(cadd_InDels) from ch_cadd_indels +// file(cadd_InDels_tbi) from ch_cadd_indels_tbi +// file(cadd_WG_SNVs) from ch_cadd_wg_snvs +// file(cadd_WG_SNVs_tbi) from ch_cadd_wg_snvs_tbi + +// output: +// set variantCaller, idSample, file("${reducedVCF}_VEP.ann.vcf") into vepVCF +// file("${reducedVCF}_VEP.summary.html") into vepReport + +// when: 'vep' in tools + +// script: +// reducedVCF = reduceVCF(vcf.fileName) +// genome = params.genome == 'smallGRCh37' ? 'GRCh37' : params.genome + +// dir_cache = (params.vep_cache && params.annotation_cache) ? " \${PWD}/${dataDir}" : "/.vep" +// cadd = (params.cadd_cache && params.cadd_wg_snvs && params.cadd_indels) ? "--plugin CADD,whole_genome_SNVs.tsv.gz,InDels.tsv.gz" : "" +// genesplicer = params.genesplicer ? 
"--plugin GeneSplicer,/opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/genesplicer,/opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/share/genesplicer-1.0-1/human,context=200,tmpdir=\$PWD/${reducedVCF}" : "--offline" +// """ +// mkdir ${reducedVCF} + +// vep \ +// -i ${vcf} \ +// -o ${reducedVCF}_VEP.ann.vcf \ +// --assembly ${genome} \ +// --species ${params.species} \ +// ${cadd} \ +// ${genesplicer} \ +// --cache \ +// --cache_version ${cache_version} \ +// --dir_cache ${dir_cache} \ +// --everything \ +// --filter_common \ +// --fork ${task.cpus} \ +// --format vcf \ +// --per_gene \ +// --stats_file ${reducedVCF}_VEP.summary.html \ +// --total_length \ +// --vcf + +// rm -rf ${reducedVCF} +// """ +// } + +// vepReport = vepReport.dump(tag:'VEP') + +// // STEP VEP.2 - VEP AFTER SNPEFF + +// process VEPmerge { +// label 'VEP' +// label 'cpus_4' + +// tag "${idSample} - ${variantCaller} - ${vcf}" + +// publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { +// if (it == "${reducedVCF}_VEP.summary.html") "Reports/${idSample}/VEP/${it}" +// else null +// } + +// input: +// set variantCaller, idSample, file(vcf), file(idx) from compressVCFsnpEffOut +// file(dataDir) from ch_vep_cache +// val cache_version from ch_vep_cache_version +// file(cadd_InDels) from ch_cadd_indels +// file(cadd_InDels_tbi) from ch_cadd_indels_tbi +// file(cadd_WG_SNVs) from ch_cadd_wg_snvs +// file(cadd_WG_SNVs_tbi) from ch_cadd_wg_snvs_tbi + +// output: +// set variantCaller, idSample, file("${reducedVCF}_VEP.ann.vcf") into vepVCFmerge +// file("${reducedVCF}_VEP.summary.html") into vepReportMerge + +// when: 'merge' in tools + +// script: +// reducedVCF = reduceVCF(vcf.fileName) +// genome = params.genome == 'smallGRCh37' ? 'GRCh37' : params.genome +// dir_cache = (params.vep_cache && params.annotation_cache) ? " \${PWD}/${dataDir}" : "/.vep" +// cadd = (params.cadd_cache && params.cadd_wg_snvs && params.cadd_indels) ? "--plugin CADD,whole_genome_SNVs.tsv.gz,InDels.tsv.gz" : "" +// genesplicer = params.genesplicer ? 
"--plugin GeneSplicer,/opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/bin/genesplicer,/opt/conda/envs/nf-core-sarek-${workflow.manifest.version}/share/genesplicer-1.0-1/human,context=200,tmpdir=\$PWD/${reducedVCF}" : "--offline" +// """ +// mkdir ${reducedVCF} + +// vep \ +// -i ${vcf} \ +// -o ${reducedVCF}_VEP.ann.vcf \ +// --assembly ${genome} \ +// --species ${params.species} \ +// ${cadd} \ +// ${genesplicer} \ +// --cache \ +// --cache_version ${cache_version} \ +// --dir_cache ${dir_cache} \ +// --everything \ +// --filter_common \ +// --fork ${task.cpus} \ +// --format vcf \ +// --per_gene \ +// --stats_file ${reducedVCF}_VEP.summary.html \ +// --total_length \ +// --vcf + +// rm -rf ${reducedVCF} +// """ +// } + +// vepReportMerge = vepReportMerge.dump(tag:'VEP') + +// vcfCompressVCFvep = vepVCF.mix(vepVCFmerge) + +// // STEP COMPRESS AND INDEX VCF.2 - VEP + +// process CompressVCFvep { +// tag "${idSample} - ${vcf}" + +// publishDir "${params.outdir}/Annotation/${idSample}/VEP", mode: params.publish_dir_mode + +// input: +// set variantCaller, idSample, file(vcf) from vcfCompressVCFvep + +// output: +// set variantCaller, idSample, file("*.vcf.gz"), file("*.vcf.gz.tbi") into compressVCFOutVEP + +// script: +// """ +// bgzip < ${vcf} > ${vcf}.gz +// tabix ${vcf}.gz +// """ +// } + +// compressVCFOutVEP = compressVCFOutVEP.dump(tag:'VCF') + +// /* +// ================================================================================ +// MultiQC +// ================================================================================ +// */ + +// // STEP MULTIQC + +// process MultiQC { +// publishDir "${params.outdir}/Reports/MultiQC", mode: params.publish_dir_mode + +// input: +// file (multiqcConfig) from ch_multiqc_config +// file (mqc_custom_config) from ch_multiqc_custom_config.collect().ifEmpty([]) +// file (versions) from ch_software_versions_yaml.collect() +// file workflow_summary from ch_workflow_summary.collectFile(name: "workflow_summary_mqc.yaml") +// file ('bamQC/*') from bamQCReport.collect().ifEmpty([]) +// file ('BCFToolsStats/*') from bcftoolsReport.collect().ifEmpty([]) +// file ('FastQC/*') from fastQCReport.collect().ifEmpty([]) +// file ('TrimmedFastQC/*') from trimGaloreReport.collect().ifEmpty([]) +// file ('MarkDuplicates/*') from duplicates_marked_report.collect().ifEmpty([]) +// file ('DuplicatesMarked/*.recal.table') from baseRecalibratorReport.collect().ifEmpty([]) +// file ('SamToolsStats/*') from samtoolsStatsReport.collect().ifEmpty([]) +// file ('snpEff/*') from snpeffReport.collect().ifEmpty([]) +// file ('VCFTools/*') from vcftoolsReport.collect().ifEmpty([]) + +// output: +// file "*multiqc_report.html" into ch_multiqc_report +// file "*_data" +// file "multiqc_plots" + +// when: !('multiqc' in skipQC) + +// script: +// rtitle = custom_runName ? "--title \"$custom_runName\"" : '' +// rfilename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : '' +// custom_config_file = params.multiqc_config ? "--config $mqc_custom_config" : '' +// """ +// multiqc -f ${rtitle} ${rfilename} ${custom_config_file} . 
+// """ +// } + +// ch_multiqc_report.dump(tag:'MultiQC') + +// // Output Description HTML +// process Output_documentation { +// publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode + +// input: +// file output_docs from ch_output_docs + +// output: +// file "results_description.html" + +// when: !('documentation' in skipQC) + +// script: +// """ +// markdown_to_html.py $output_docs -o results_description.html +// """ +// } + +// // Completion e-mail notification +// workflow.onComplete { + +// // Set up the e-mail variables +// def subject = "[nf-core/sarek] Successful: $workflow.runName" +// if (!workflow.success) { +// subject = "[nf-core/sarek] FAILED: $workflow.runName" +// } +// def email_fields = [:] +// email_fields['version'] = workflow.manifest.version +// email_fields['runName'] = custom_runName ?: workflow.runName +// email_fields['success'] = workflow.success +// email_fields['dateComplete'] = workflow.complete +// email_fields['duration'] = workflow.duration +// email_fields['exitStatus'] = workflow.exitStatus +// email_fields['errorMessage'] = (workflow.errorMessage ?: 'None') +// email_fields['errorReport'] = (workflow.errorReport ?: 'None') +// email_fields['commandLine'] = workflow.commandLine +// email_fields['projectDir'] = workflow.projectDir +// email_fields['summary'] = summary +// email_fields['summary']['Date Started'] = workflow.start +// email_fields['summary']['Date Completed'] = workflow.complete +// email_fields['summary']['Pipeline script file path'] = workflow.scriptFile +// email_fields['summary']['Pipeline script hash ID'] = workflow.scriptId +// if (workflow.repository) email_fields['summary']['Pipeline repository Git URL'] = workflow.repository +// if (workflow.commitId) email_fields['summary']['Pipeline repository Git Commit'] = workflow.commitId +// if (workflow.revision) email_fields['summary']['Pipeline Git branch/tag'] = workflow.revision +// email_fields['summary']['Nextflow Version'] = workflow.nextflow.version +// email_fields['summary']['Nextflow Build'] = workflow.nextflow.build +// email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp + +// def mqc_report = null +// try { +// if (workflow.success) { +// mqc_report = ch_multiqc_report.getVal() +// if (mqc_report.getClass() == ArrayList) { +// log.warn "[nf-core/sarek] Found multiple reports from process 'multiqc', will use only one" +// mqc_report = mqc_report[0] +// } +// } +// } catch (all) { +// log.warn "[nf-core/sarek] Could not attach MultiQC report to summary email" +// } + +// // Check if we are only sending emails on failure +// email_address = params.email +// if (!params.email && params.email_on_fail && !workflow.success) { +// email_address = params.email_on_fail +// } + +// // Render the TXT template +// def engine = new groovy.text.GStringTemplateEngine() +// def tf = new File("$baseDir/assets/email_template.txt") +// def txt_template = engine.createTemplate(tf).make(email_fields) +// def email_txt = txt_template.toString() + +// // Render the HTML template +// def hf = new File("$baseDir/assets/email_template.html") +// def html_template = engine.createTemplate(hf).make(email_fields) +// def email_html = html_template.toString() + +// // Render the sendmail template +// def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$baseDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ] +// def sf = new 
File("$baseDir/assets/sendmail_template.txt") +// def sendmail_template = engine.createTemplate(sf).make(smail_fields) +// def sendmail_html = sendmail_template.toString() + +// // Send the HTML e-mail +// if (email_address) { +// try { +// if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } +// // Try to send HTML e-mail using sendmail +// [ 'sendmail', '-t' ].execute() << sendmail_html +// log.info "[nf-core/sarek] Sent summary e-mail to $email_address (sendmail)" +// } catch (all) { +// // Catch failures and try with plaintext +// [ 'mail', '-s', subject, email_address ].execute() << email_txt +// log.info "[nf-core/sarek] Sent summary e-mail to $email_address (mail)" +// } +// } + +// // Write summary e-mail HTML to a file +// def output_d = new File("${params.outdir}/pipeline_info/") +// if (!output_d.exists()) { +// output_d.mkdirs() +// } +// def output_hf = new File(output_d, "pipeline_report.html") +// output_hf.withWriter { w -> w << email_html } +// def output_tf = new File(output_d, "pipeline_report.txt") +// output_tf.withWriter { w -> w << email_txt } + +// c_green = params.monochrome_logs ? '' : "\033[0;32m"; +// c_purple = params.monochrome_logs ? '' : "\033[0;35m"; +// c_red = params.monochrome_logs ? '' : "\033[0;31m"; +// c_reset = params.monochrome_logs ? '' : "\033[0m"; + +// if (workflow.stats.ignoredCount > 0 && workflow.success) { +// log.info "-${c_purple}Warning, pipeline completed, but with errored process(es) ${c_reset}-" +// log.info "-${c_red}Number of ignored errored process(es) : ${workflow.stats.ignoredCount} ${c_reset}-" +// log.info "-${c_green}Number of successfully ran process(es) : ${workflow.stats.succeedCount} ${c_reset}-" +// } + +// if (workflow.success) { +// log.info "-${c_purple}[nf-core/sarek]${c_green} Pipeline completed successfully${c_reset}-" +// } else { +// checkHostname() +// log.info "-${c_purple}[nf-core/sarek]${c_red} Pipeline completed with errors${c_reset}-" +// } +// } + +// /* +// ================================================================================ +// nf-core functions +// ================================================================================ +// */ + +// def create_workflow_summary(summary) { +// def yaml_file = workDir.resolve('workflow_summary_mqc.yaml') +// yaml_file.text = """ +// id: 'nf-core-sarek-summary' +// description: " - this information is collected when the pipeline is started." +// section_name: 'nf-core/sarek Workflow Summary' +// section_href: 'https://github.com/nf-core/sarek' +// plot_type: 'html' +// data: | +//
+//         <dl class=\"dl-horizontal\">
+// ${summary.collect { k, v -> "            <dt>$k</dt><dd><samp>${v ?: '<span style=\"color:#999999;\">N/A</a>'}</samp></dd>" }.join("\n")}
+//         </dl>
+// """.stripIndent() + +// return yaml_file +// } + +// def nfcoreHeader() { +// // Log colors ANSI codes +// c_black = params.monochrome_logs ? '' : "\033[0;30m"; +// c_blue = params.monochrome_logs ? '' : "\033[0;34m"; +// c_dim = params.monochrome_logs ? '' : "\033[2m"; +// c_green = params.monochrome_logs ? '' : "\033[0;32m"; +// c_purple = params.monochrome_logs ? '' : "\033[0;35m"; +// c_reset = params.monochrome_logs ? '' : "\033[0m"; +// c_white = params.monochrome_logs ? '' : "\033[0;37m"; +// c_yellow = params.monochrome_logs ? '' : "\033[0;33m"; + +// return """ -${c_dim}--------------------------------------------------${c_reset}- +// ${c_green},--.${c_black}/${c_green},-.${c_reset} +// ${c_blue} ___ __ __ __ ___ ${c_green}/,-._.--~\'${c_reset} +// ${c_blue} |\\ | |__ __ / ` / \\ |__) |__ ${c_yellow}} {${c_reset} +// ${c_blue} | \\| | \\__, \\__/ | \\ |___ ${c_green}\\`-._,-`-,${c_reset} +// ${c_green}`._,._,\'${c_reset} +// ${c_white}____${c_reset} +// ${c_white}.´ _ `.${c_reset} +// ${c_white}/ ${c_green}|\\${c_reset}`-_ \\${c_reset} ${c_blue} __ __ ___ ${c_reset} +// ${c_white}| ${c_green}| \\${c_reset} `-|${c_reset} ${c_blue}|__` /\\ |__) |__ |__/${c_reset} +// ${c_white}\\ ${c_green}| \\${c_reset} /${c_reset} ${c_blue}.__| /¯¯\\ | \\ |___ | \\${c_reset} +// ${c_white}`${c_green}|${c_reset}____${c_green}\\${c_reset}´${c_reset} + +// ${c_purple} nf-core/sarek v${workflow.manifest.version}${c_reset} +// -${c_dim}--------------------------------------------------${c_reset}- +// """.stripIndent() +// } + +// def checkHostname() { +// def c_reset = params.monochrome_logs ? '' : "\033[0m" +// def c_white = params.monochrome_logs ? '' : "\033[0;37m" +// def c_red = params.monochrome_logs ? '' : "\033[1;91m" +// def c_yellow_bold = params.monochrome_logs ? 
'' : "\033[1;93m" +// if (params.hostnames) { +// def hostname = "hostname".execute().text.trim() +// params.hostnames.each { prof, hnames -> +// hnames.each { hname -> +// if (hostname.contains(hname) && !workflow.profile.contains(prof)) { +// log.error "====================================================\n" + +// " ${c_red}WARNING!${c_reset} You are running with `-profile $workflow.profile`\n" + +// " but your machine hostname is ${c_white}'$hostname'${c_reset}\n" + +// " ${c_yellow_bold}It's highly recommended that you use `-profile $prof${c_reset}`\n" + +// "============================================================" +// } +// } +// } +// } +// } + +// /* +// ================================================================================ +// sarek functions +// ================================================================================ +// */ + +// // Check if a row has the expected number of item +// def checkNumberOfItem(row, number) { +// if (row.size() != number) exit 1, "Malformed row in TSV file: ${row}, see --help for more information" +// return true +// } + +// // Check parameter existence +// def checkParameterExistence(it, list) { +// if (!list.contains(it)) { +// log.warn "Unknown parameter: ${it}" +// return false +// } +// return true +// } + +// // Compare each parameter with a list of parameters +// def checkParameterList(list, realList) { +// return list.every{ checkParameterExistence(it, realList) } +// } + +// // Define list of available tools to annotate +// def defineAnnoList() { +// return [ +// 'haplotypecaller', +// 'manta', +// 'mutect2', +// 'strelka', +// 'tiddit' +// ] +// } + +// // Define list of skipable QC tools +// def defineSkipQClist() { +// return [ +// 'bamqc', +// 'baserecalibrator', +// 'bcftools', +// 'documentation', +// 'fastqc', +// 'markduplicates', +// 'multiqc', +// 'samtools', +// 'sentieon', +// 'vcftools', +// 'versions' +// ] +// } + +// // Define list of available step +// def defineStepList() { +// return [ +// 'annotate', +// 'controlfreec', +// 'mapping', +// 'preparerecalibration', +// 'recalibrate', +// 'variantcalling' +// ] +// } + +// // Define list of available tools +// def defineToolList() { +// return [ +// 'ascat', +// 'cnvkit', +// 'controlfreec', +// 'dnascope', +// 'dnaseq', +// 'freebayes', +// 'haplotypecaller', +// 'manta', +// 'merge', +// 'mpileup', +// 'mutect2', +// 'snpeff', +// 'strelka', +// 'tiddit', +// 'tnscope', +// 'vep', +// 'msisensor' +// ] +// } + +// // Channeling the TSV file containing BAM. +// // Format is: "subject gender status sample bam bai" +// def extractBam(tsvFile) { +// Channel.from(tsvFile) +// .splitCsv(sep: '\t') +// .map { row -> +// checkNumberOfItem(row, 6) +// def idPatient = row[0] +// def gender = row[1] +// def status = returnStatus(row[2].toInteger()) +// def idSample = row[3] +// def bamFile = returnFile(row[4]) +// def baiFile = returnFile(row[5]) + +// if (!hasExtension(bamFile, "bam")) exit 1, "File: ${bamFile} has the wrong extension. See --help for more information" +// if (!hasExtension(baiFile, "bai")) exit 1, "File: ${baiFile} has the wrong extension. See --help for more information" + +// return [idPatient, gender, status, idSample, bamFile, baiFile] +// } +// } + +// // Create a channel of germline FASTQs from a directory pattern: "my_samples/*/" +// // All FASTQ files in subdirectories are collected and emitted; +// // they must have _R1_ and _R2_ in their names. 
+// def extractFastqFromDir(pattern) { +// def fastq = Channel.create() +// // a temporary channel does all the work +// Channel +// .fromPath(pattern, type: 'dir') +// .ifEmpty { error "No directories found matching pattern '${pattern}'" } +// .subscribe onNext: { sampleDir -> +// // the last name of the sampleDir is assumed to be a unique sample id +// sampleId = sampleDir.getFileName().toString() + +// for (path1 in file("${sampleDir}/**_R1_*.fastq.gz")) { +// assert path1.getName().contains('_R1_') +// path2 = file(path1.toString().replace('_R1_', '_R2_')) +// if (!path2.exists()) error "Path '${path2}' not found" +// (flowcell, lane) = flowcellLaneFromFastq(path1) +// patient = sampleId +// gender = 'ZZ' // unused +// status = 0 // normal (not tumor) +// rgId = "${flowcell}.${sampleId}.${lane}" +// result = [patient, gender, status, sampleId, rgId, path1, path2] +// fastq.bind(result) +// } +// }, onComplete: { fastq.close() } +// fastq +// } + +// // Extract gender and status from Channel +// def extractInfos(channel) { +// def genderMap = [:] +// def statusMap = [:] +// channel = channel.map{ it -> +// def idPatient = it[0] +// def gender = it[1] +// def status = it[2] +// def idSample = it[3] +// genderMap[idPatient] = gender +// statusMap[idPatient, idSample] = status +// [idPatient] + it[3..-1] +// } +// [genderMap, statusMap, channel] +// } + +// // Channeling the TSV file containing FASTQ or BAM +// // Format is: "subject gender status sample lane fastq1 fastq2" +// // or: "subject gender status sample lane bam" +// def extractFastq(tsvFile) { +// Channel.from(tsvFile) +// .splitCsv(sep: '\t') +// .map { row -> +// def idPatient = row[0] +// def gender = row[1] +// def status = returnStatus(row[2].toInteger()) +// def idSample = row[3] +// def idRun = row[4] +// def file1 = returnFile(row[5]) +// def file2 = "null" +// if (hasExtension(file1, "fastq.gz") || hasExtension(file1, "fq.gz") || hasExtension(file1, "fastq") || hasExtension(file1, "fq")) { +// checkNumberOfItem(row, 7) +// file2 = returnFile(row[6]) +// if (!hasExtension(file2, "fastq.gz") && !hasExtension(file2, "fq.gz") && !hasExtension(file2, "fastq") && !hasExtension(file2, "fq")) exit 1, "File: ${file2} has the wrong extension. See --help for more information" +// if (hasExtension(file1, "fastq") || hasExtension(file1, "fq") || hasExtension(file2, "fastq") || hasExtension(file2, "fq")) { +// exit 1, "We do recommend to use gziped fastq file to help you reduce your data footprint." +// } +// } +// else if (hasExtension(file1, "bam")) checkNumberOfItem(row, 6) +// else "No recognisable extention for input file: ${file1}" + +// [idPatient, gender, status, idSample, idRun, file1, file2] +// } +// } + +// // Channeling the TSV file containing mpileup +// // Format is: "subject gender status sample pileup" +// def extractPileup(tsvFile) { +// Channel.from(tsvFile) +// .splitCsv(sep: '\t') +// .map { row -> +// checkNumberOfItem(row, 5) +// def idPatient = row[0] +// def gender = row[1] +// def status = returnStatus(row[2].toInteger()) +// def idSample = row[3] +// def mpileup = returnFile(row[4]) + +// if (!hasExtension(mpileup, "pileup")) exit 1, "File: ${mpileup} has the wrong extension. See --help for more information" + +// return [idPatient, gender, status, idSample, mpileup] +// } +// } + +// // Channeling the TSV file containing Recalibration Tables. 
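+// // An illustrative row (tab-separated, hypothetical names) matching the format
+// // given below: patient1  XX  0  sample1  sample1.bam  sample1.bai  sample1.recal.table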
+// // Format is: "subject gender status sample bam bai recalTable" +// def extractRecal(tsvFile) { +// Channel.from(tsvFile) +// .splitCsv(sep: '\t') +// .map { row -> +// checkNumberOfItem(row, 7) +// def idPatient = row[0] +// def gender = row[1] +// def status = returnStatus(row[2].toInteger()) +// def idSample = row[3] +// def bamFile = returnFile(row[4]) +// def baiFile = returnFile(row[5]) +// def recalTable = returnFile(row[6]) + +// if (!hasExtension(bamFile, "bam")) exit 1, "File: ${bamFile} has the wrong extension. See --help for more information" +// if (!hasExtension(baiFile, "bai")) exit 1, "File: ${baiFile} has the wrong extension. See --help for more information" +// if (!hasExtension(recalTable, "recal.table")) exit 1, "File: ${recalTable} has the wrong extension. See --help for more information" + +// [idPatient, gender, status, idSample, bamFile, baiFile, recalTable] +// } +// } + +// // Parse first line of a FASTQ file, return the flowcell id and lane number. +// def flowcellLaneFromFastq(path) { +// // expected format: +// // xx:yy:FLOWCELLID:LANE:... (seven fields) +// // or +// // FLOWCELLID:LANE:xx:... (five fields) +// InputStream fileStream = new FileInputStream(path.toFile()) +// InputStream gzipStream = new java.util.zip.GZIPInputStream(fileStream) +// Reader decoder = new InputStreamReader(gzipStream, 'ASCII') +// BufferedReader buffered = new BufferedReader(decoder) +// def line = buffered.readLine() +// assert line.startsWith('@') +// line = line.substring(1) +// def fields = line.split(' ')[0].split(':') +// String fcid +// int lane +// if (fields.size() == 7) { +// // CASAVA 1.8+ format +// fcid = fields[2] +// lane = fields[3].toInteger() +// } else if (fields.size() == 5) { +// fcid = fields[0] +// lane = fields[1].toInteger() +// } +// [fcid, lane] +// } + +// // Check file extension +// def hasExtension(it, extension) { +// it.toString().toLowerCase().endsWith(extension.toLowerCase()) +// } + +// // Return file if it exists +// def returnFile(it) { +// if (!file(it).exists()) exit 1, "Missing file in TSV file: ${it}, see --help for more information" +// return file(it) +// } + +// // Remove .ann .gz and .vcf extension from a VCF file +// def reduceVCF(file) { +// return file.fileName.toString().minus(".ann").minus(".vcf").minus(".gz") +// } + +// // Return status [0,1] +// // 0 == Normal, 1 == Tumor +// def returnStatus(it) { +// if (!(it in [0, 1])) exit 1, "Status is not recognized in TSV file: ${it}, see --help for more information" +// return it +// } From 1981679f6cd9597403ba8c96f1e120dde4773a42 Mon Sep 17 00:00:00 2001 From: ggabernet Date: Tue, 14 Jul 2020 11:14:10 +0200 Subject: [PATCH 003/200] update NXF version --- .github/workflows/ci.yml | 4 ++-- CHANGELOG.md | 4 ++++ README.md | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4aae90c0e7..477455964d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,7 +12,7 @@ jobs: strategy: matrix: # Nextflow versions: check pipeline minimum and current latest - nxf_ver: ['19.10.0', ''] + nxf_ver: ['20.04.1', ''] steps: - uses: actions/checkout@v2 - name: Install Nextflow @@ -42,7 +42,7 @@ jobs: sudo mv nextflow /usr/local/bin/ env: # Only check Nextflow pipeline minimum version - NXF_VER: '19.10.0' + NXF_VER: '20.04.1' - name: Pull docker image run: | docker pull nfcore/sarek:dev diff --git a/CHANGELOG.md b/CHANGELOG.md index 7764099969..ec13b524b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,10 
+6,14 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) a ## [dev](https://github.com/nf-core/sarek/tree/dev) +- Switching to DSL2 + ### Added ### Changed +- Update Nextflow `19.10.0` -> `20.04.1` + ### Fixed - [#229](https://github.com/nf-core/sarek/pull/229) - Fix `Control-FREEC` restart issue [#225](https://github.com/nf-core/sarek/issues/225) diff --git a/README.md b/README.md index adf95812eb..bb70fe3771 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ > **An open-source analysis pipeline to detect germline or somatic variants from whole genome or targeted sequencing** -[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A519.10.0-brightgreen.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A520.04.1-brightgreen.svg)](https://www.nextflow.io/) [![nf-core](https://img.shields.io/badge/nf--core-pipeline-brightgreen.svg)](https://nf-co.re/) [![DOI](https://zenodo.org/badge/184289291.svg)](https://zenodo.org/badge/latestdoi/184289291) From 442ad0ded7c4a2a37791c318e11a8892cb59e0e6 Mon Sep 17 00:00:00 2001 From: ggabernet Date: Tue, 14 Jul 2020 11:17:11 +0200 Subject: [PATCH 004/200] move sarek functions to functions.nf --- main.nf | 280 +------------------------------------ modules/local/functions.nf | 275 ++++++++++++++++++++++++++++++++++++ 2 files changed, 280 insertions(+), 275 deletions(-) create mode 100644 modules/local/functions.nf diff --git a/main.nf b/main.nf index d1cc24b18c..fcb8687a93 100644 --- a/main.nf +++ b/main.nf @@ -283,6 +283,11 @@ workflow { ch_workflow_summary) } +/* +================================================================================ + GET software versions +================================================================================ +*/ // process Get_software_versions { // publishDir path:"${params.outdir}/pipeline_info", mode: params.publish_dir_mode, // saveAs: {it.indexOf(".csv") > 0 ? 
it : null} @@ -3577,278 +3582,3 @@ workflow { // } // } -// /* -// ================================================================================ -// sarek functions -// ================================================================================ -// */ - -// // Check if a row has the expected number of item -// def checkNumberOfItem(row, number) { -// if (row.size() != number) exit 1, "Malformed row in TSV file: ${row}, see --help for more information" -// return true -// } - -// // Check parameter existence -// def checkParameterExistence(it, list) { -// if (!list.contains(it)) { -// log.warn "Unknown parameter: ${it}" -// return false -// } -// return true -// } - -// // Compare each parameter with a list of parameters -// def checkParameterList(list, realList) { -// return list.every{ checkParameterExistence(it, realList) } -// } - -// // Define list of available tools to annotate -// def defineAnnoList() { -// return [ -// 'haplotypecaller', -// 'manta', -// 'mutect2', -// 'strelka', -// 'tiddit' -// ] -// } - -// // Define list of skipable QC tools -// def defineSkipQClist() { -// return [ -// 'bamqc', -// 'baserecalibrator', -// 'bcftools', -// 'documentation', -// 'fastqc', -// 'markduplicates', -// 'multiqc', -// 'samtools', -// 'sentieon', -// 'vcftools', -// 'versions' -// ] -// } - -// // Define list of available step -// def defineStepList() { -// return [ -// 'annotate', -// 'controlfreec', -// 'mapping', -// 'preparerecalibration', -// 'recalibrate', -// 'variantcalling' -// ] -// } - -// // Define list of available tools -// def defineToolList() { -// return [ -// 'ascat', -// 'cnvkit', -// 'controlfreec', -// 'dnascope', -// 'dnaseq', -// 'freebayes', -// 'haplotypecaller', -// 'manta', -// 'merge', -// 'mpileup', -// 'mutect2', -// 'snpeff', -// 'strelka', -// 'tiddit', -// 'tnscope', -// 'vep', -// 'msisensor' -// ] -// } - -// // Channeling the TSV file containing BAM. -// // Format is: "subject gender status sample bam bai" -// def extractBam(tsvFile) { -// Channel.from(tsvFile) -// .splitCsv(sep: '\t') -// .map { row -> -// checkNumberOfItem(row, 6) -// def idPatient = row[0] -// def gender = row[1] -// def status = returnStatus(row[2].toInteger()) -// def idSample = row[3] -// def bamFile = returnFile(row[4]) -// def baiFile = returnFile(row[5]) - -// if (!hasExtension(bamFile, "bam")) exit 1, "File: ${bamFile} has the wrong extension. See --help for more information" -// if (!hasExtension(baiFile, "bai")) exit 1, "File: ${baiFile} has the wrong extension. See --help for more information" - -// return [idPatient, gender, status, idSample, bamFile, baiFile] -// } -// } - -// // Create a channel of germline FASTQs from a directory pattern: "my_samples/*/" -// // All FASTQ files in subdirectories are collected and emitted; -// // they must have _R1_ and _R2_ in their names. 
-// def extractFastqFromDir(pattern) { -// def fastq = Channel.create() -// // a temporary channel does all the work -// Channel -// .fromPath(pattern, type: 'dir') -// .ifEmpty { error "No directories found matching pattern '${pattern}'" } -// .subscribe onNext: { sampleDir -> -// // the last name of the sampleDir is assumed to be a unique sample id -// sampleId = sampleDir.getFileName().toString() - -// for (path1 in file("${sampleDir}/**_R1_*.fastq.gz")) { -// assert path1.getName().contains('_R1_') -// path2 = file(path1.toString().replace('_R1_', '_R2_')) -// if (!path2.exists()) error "Path '${path2}' not found" -// (flowcell, lane) = flowcellLaneFromFastq(path1) -// patient = sampleId -// gender = 'ZZ' // unused -// status = 0 // normal (not tumor) -// rgId = "${flowcell}.${sampleId}.${lane}" -// result = [patient, gender, status, sampleId, rgId, path1, path2] -// fastq.bind(result) -// } -// }, onComplete: { fastq.close() } -// fastq -// } - -// // Extract gender and status from Channel -// def extractInfos(channel) { -// def genderMap = [:] -// def statusMap = [:] -// channel = channel.map{ it -> -// def idPatient = it[0] -// def gender = it[1] -// def status = it[2] -// def idSample = it[3] -// genderMap[idPatient] = gender -// statusMap[idPatient, idSample] = status -// [idPatient] + it[3..-1] -// } -// [genderMap, statusMap, channel] -// } - -// // Channeling the TSV file containing FASTQ or BAM -// // Format is: "subject gender status sample lane fastq1 fastq2" -// // or: "subject gender status sample lane bam" -// def extractFastq(tsvFile) { -// Channel.from(tsvFile) -// .splitCsv(sep: '\t') -// .map { row -> -// def idPatient = row[0] -// def gender = row[1] -// def status = returnStatus(row[2].toInteger()) -// def idSample = row[3] -// def idRun = row[4] -// def file1 = returnFile(row[5]) -// def file2 = "null" -// if (hasExtension(file1, "fastq.gz") || hasExtension(file1, "fq.gz") || hasExtension(file1, "fastq") || hasExtension(file1, "fq")) { -// checkNumberOfItem(row, 7) -// file2 = returnFile(row[6]) -// if (!hasExtension(file2, "fastq.gz") && !hasExtension(file2, "fq.gz") && !hasExtension(file2, "fastq") && !hasExtension(file2, "fq")) exit 1, "File: ${file2} has the wrong extension. See --help for more information" -// if (hasExtension(file1, "fastq") || hasExtension(file1, "fq") || hasExtension(file2, "fastq") || hasExtension(file2, "fq")) { -// exit 1, "We do recommend to use gziped fastq file to help you reduce your data footprint." -// } -// } -// else if (hasExtension(file1, "bam")) checkNumberOfItem(row, 6) -// else "No recognisable extention for input file: ${file1}" - -// [idPatient, gender, status, idSample, idRun, file1, file2] -// } -// } - -// // Channeling the TSV file containing mpileup -// // Format is: "subject gender status sample pileup" -// def extractPileup(tsvFile) { -// Channel.from(tsvFile) -// .splitCsv(sep: '\t') -// .map { row -> -// checkNumberOfItem(row, 5) -// def idPatient = row[0] -// def gender = row[1] -// def status = returnStatus(row[2].toInteger()) -// def idSample = row[3] -// def mpileup = returnFile(row[4]) - -// if (!hasExtension(mpileup, "pileup")) exit 1, "File: ${mpileup} has the wrong extension. See --help for more information" - -// return [idPatient, gender, status, idSample, mpileup] -// } -// } - -// // Channeling the TSV file containing Recalibration Tables. 
-// // Format is: "subject gender status sample bam bai recalTable"
-// def extractRecal(tsvFile) {
-//     Channel.from(tsvFile)
-//         .splitCsv(sep: '\t')
-//         .map { row ->
-//             checkNumberOfItem(row, 7)
-//             def idPatient  = row[0]
-//             def gender     = row[1]
-//             def status     = returnStatus(row[2].toInteger())
-//             def idSample   = row[3]
-//             def bamFile    = returnFile(row[4])
-//             def baiFile    = returnFile(row[5])
-//             def recalTable = returnFile(row[6])
-
-//             if (!hasExtension(bamFile, "bam")) exit 1, "File: ${bamFile} has the wrong extension. See --help for more information"
-//             if (!hasExtension(baiFile, "bai")) exit 1, "File: ${baiFile} has the wrong extension. See --help for more information"
-//             if (!hasExtension(recalTable, "recal.table")) exit 1, "File: ${recalTable} has the wrong extension. See --help for more information"
-
-//             [idPatient, gender, status, idSample, bamFile, baiFile, recalTable]
-//         }
-// }
-
-// // Parse first line of a FASTQ file, return the flowcell id and lane number.
-// def flowcellLaneFromFastq(path) {
-//     // expected format:
-//     // xx:yy:FLOWCELLID:LANE:... (seven fields)
-//     // or
-//     // FLOWCELLID:LANE:xx:... (five fields)
-//     InputStream fileStream = new FileInputStream(path.toFile())
-//     InputStream gzipStream = new java.util.zip.GZIPInputStream(fileStream)
-//     Reader decoder = new InputStreamReader(gzipStream, 'ASCII')
-//     BufferedReader buffered = new BufferedReader(decoder)
-//     def line = buffered.readLine()
-//     assert line.startsWith('@')
-//     line = line.substring(1)
-//     def fields = line.split(' ')[0].split(':')
-//     String fcid
-//     int lane
-//     if (fields.size() == 7) {
-//         // CASAVA 1.8+ format
-//         fcid = fields[2]
-//         lane = fields[3].toInteger()
-//     } else if (fields.size() == 5) {
-//         fcid = fields[0]
-//         lane = fields[1].toInteger()
-//     }
-//     [fcid, lane]
-// }
-
-// // Check file extension
-// def hasExtension(it, extension) {
-//     it.toString().toLowerCase().endsWith(extension.toLowerCase())
-// }
-
-// // Return file if it exists
-// def returnFile(it) {
-//     if (!file(it).exists()) exit 1, "Missing file in TSV file: ${it}, see --help for more information"
-//     return file(it)
-// }
-
-// // Remove .ann .gz and .vcf extension from a VCF file
-// def reduceVCF(file) {
-//     return file.fileName.toString().minus(".ann").minus(".vcf").minus(".gz")
-// }
-
-// // Return status [0,1]
-// // 0 == Normal, 1 == Tumor
-// def returnStatus(it) {
-//     if (!(it in [0, 1])) exit 1, "Status is not recognized in TSV file: ${it}, see --help for more information"
-//     return it
-// }
diff --git a/modules/local/functions.nf b/modules/local/functions.nf
new file mode 100644
index 0000000000..3e9bb62ccd
--- /dev/null
+++ b/modules/local/functions.nf
@@ -0,0 +1,275 @@
+/*
+================================================================================
+                                 sarek functions
+================================================================================
+*/
+
+// Check if a row has the expected number of items
+def checkNumberOfItem(row, number) {
+    if (row.size() != number) exit 1, "Malformed row in TSV file: ${row}, see --help for more information"
+    return true
+}
+
+// Check parameter existence
+def checkParameterExistence(it, list) {
+    if (!list.contains(it)) {
+        log.warn "Unknown parameter: ${it}"
+        return false
+    }
+    return true
+}
+
+// Compare each parameter with a list of parameters
+def checkParameterList(list, realList) {
+    return list.every{ checkParameterExistence(it, realList) }
+}
+
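As a quick illustration of how these three validation helpers compose (purely illustrative, not part of the patch; the tool names stand in for user input, and defineToolList() is the helper defined further down in this file):

    // Validate a user-supplied, comma-separated tool string against the known tools
    def tools = 'strelka,manta'.split(',').collect { it.trim().toLowerCase() }
    assert checkParameterList(tools, defineToolList())         // true: both tools are known
    assert !checkParameterExistence('foo', defineToolList())   // false, after logging "Unknown parameter: foo"
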
+// Define list of available tools to annotate
+def defineAnnoList() {
+    return [
+        'haplotypecaller',
+        'manta',
+        'mutect2',
+        'strelka',
+        'tiddit'
+    ]
+}
+
+// Define list of skippable QC tools
+def defineSkipQClist() {
+    return [
+        'bamqc',
+        'baserecalibrator',
+        'bcftools',
+        'documentation',
+        'fastqc',
+        'markduplicates',
+        'multiqc',
+        'samtools',
+        'sentieon',
+        'vcftools',
+        'versions'
+    ]
+}
+
+// Define list of available steps
+def defineStepList() {
+    return [
+        'annotate',
+        'controlfreec',
+        'mapping',
+        'preparerecalibration',
+        'recalibrate',
+        'variantcalling'
+    ]
+}
+
+// Define list of available tools
+def defineToolList() {
+    return [
+        'ascat',
+        'cnvkit',
+        'controlfreec',
+        'dnascope',
+        'dnaseq',
+        'freebayes',
+        'haplotypecaller',
+        'manta',
+        'merge',
+        'mpileup',
+        'mutect2',
+        'snpeff',
+        'strelka',
+        'tiddit',
+        'tnscope',
+        'vep',
+        'msisensor'
+    ]
+}
+
+// Channeling the TSV file containing BAM.
+// Format is: "subject gender status sample bam bai"
+def extractBam(tsvFile) {
+    Channel.from(tsvFile)
+        .splitCsv(sep: '\t')
+        .map { row ->
+            checkNumberOfItem(row, 6)
+            def idPatient = row[0]
+            def gender    = row[1]
+            def status    = returnStatus(row[2].toInteger())
+            def idSample  = row[3]
+            def bamFile   = returnFile(row[4])
+            def baiFile   = returnFile(row[5])
+
+            if (!hasExtension(bamFile, "bam")) exit 1, "File: ${bamFile} has the wrong extension. See --help for more information"
+            if (!hasExtension(baiFile, "bai")) exit 1, "File: ${baiFile} has the wrong extension. See --help for more information"
+
+            return [idPatient, gender, status, idSample, bamFile, baiFile]
+        }
+}
+
+// Create a channel of germline FASTQs from a directory pattern: "my_samples/*/"
+// All FASTQ files in subdirectories are collected and emitted;
+// they must have _R1_ and _R2_ in their names.
+def extractFastqFromDir(pattern) {
+    def fastq = Channel.create()
+    // a temporary channel does all the work
+    Channel
+        .fromPath(pattern, type: 'dir')
+        .ifEmpty { error "No directories found matching pattern '${pattern}'" }
+        .subscribe onNext: { sampleDir ->
+            // the last name of the sampleDir is assumed to be a unique sample id
+            sampleId = sampleDir.getFileName().toString()
+
+            for (path1 in file("${sampleDir}/**_R1_*.fastq.gz")) {
+                assert path1.getName().contains('_R1_')
+                path2 = file(path1.toString().replace('_R1_', '_R2_'))
+                if (!path2.exists()) error "Path '${path2}' not found"
+                (flowcell, lane) = flowcellLaneFromFastq(path1)
+                patient = sampleId
+                gender = 'ZZ'  // unused
+                status = 0  // normal (not tumor)
+                rgId = "${flowcell}.${sampleId}.${lane}"
+                result = [patient, gender, status, sampleId, rgId, path1, path2]
+                fastq.bind(result)
+            }
+        }, onComplete: { fastq.close() }
+    fastq
+}
+
+// Extract gender and status from Channel
+def extractInfos(channel) {
+    def genderMap = [:]
+    def statusMap = [:]
+    channel = channel.map{ it ->
+        def idPatient = it[0]
+        def gender = it[1]
+        def status = it[2]
+        def idSample = it[3]
+        genderMap[idPatient] = gender
+        statusMap[idPatient, idSample] = status
+        [idPatient] + it[3..-1]
+    }
+    [genderMap, statusMap, channel]
+}
+
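To make the expected TSV layout concrete for the extractFastq helper that follows, here is an illustrative mapping-step row and the tuple it would emit (the subject, sample IDs, and paths are hypothetical):

    // Tab-separated input row: subject  gender  status  sample  lane  fastq1  fastq2
    // patient1    XX    0    sample1    lane1    /data/sample1_R1.fastq.gz    /data/sample1_R2.fastq.gz
    //
    // emitted tuple:
    // ['patient1', 'XX', 0, 'sample1', 'lane1', sample1_R1.fastq.gz, sample1_R2.fastq.gz]
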
+// Channeling the TSV file containing FASTQ or BAM
+// Format is: "subject gender status sample lane fastq1 fastq2"
+// or: "subject gender status sample lane bam"
+def extractFastq(tsvFile) {
+    Channel.from(tsvFile)
+        .splitCsv(sep: '\t')
+        .map { row ->
+            def idPatient = row[0]
+            def gender    = row[1]
+            def status    = returnStatus(row[2].toInteger())
+            def idSample  = row[3]
+            def idRun     = row[4]
+            def file1     = returnFile(row[5])
+            def file2     = "null"
+            if (hasExtension(file1, "fastq.gz") || hasExtension(file1, "fq.gz") || hasExtension(file1, "fastq") || hasExtension(file1, "fq")) {
+                checkNumberOfItem(row, 7)
+                file2 = returnFile(row[6])
+                if (!hasExtension(file2, "fastq.gz") && !hasExtension(file2, "fq.gz") && !hasExtension(file2, "fastq") && !hasExtension(file2, "fq")) exit 1, "File: ${file2} has the wrong extension. See --help for more information"
+                if (hasExtension(file1, "fastq") || hasExtension(file1, "fq") || hasExtension(file2, "fastq") || hasExtension(file2, "fq")) {
+                    exit 1, "We recommend using gzipped FASTQ files to help reduce your data footprint."
+                }
+            }
+            else if (hasExtension(file1, "bam")) checkNumberOfItem(row, 6)
+            else exit 1, "No recognisable extension for input file: ${file1}"
+
+            [idPatient, gender, status, idSample, idRun, file1, file2]
+        }
+}
+
+// Channeling the TSV file containing mpileup
+// Format is: "subject gender status sample pileup"
+def extractPileup(tsvFile) {
+    Channel.from(tsvFile)
+        .splitCsv(sep: '\t')
+        .map { row ->
+            checkNumberOfItem(row, 5)
+            def idPatient = row[0]
+            def gender    = row[1]
+            def status    = returnStatus(row[2].toInteger())
+            def idSample  = row[3]
+            def mpileup   = returnFile(row[4])
+
+            if (!hasExtension(mpileup, "pileup")) exit 1, "File: ${mpileup} has the wrong extension. See --help for more information"
+
+            return [idPatient, gender, status, idSample, mpileup]
+        }
+}
+
+// Channeling the TSV file containing Recalibration Tables.
+// Format is: "subject gender status sample bam bai recalTable"
+def extractRecal(tsvFile) {
+    Channel.from(tsvFile)
+        .splitCsv(sep: '\t')
+        .map { row ->
+            checkNumberOfItem(row, 7)
+            def idPatient  = row[0]
+            def gender     = row[1]
+            def status     = returnStatus(row[2].toInteger())
+            def idSample   = row[3]
+            def bamFile    = returnFile(row[4])
+            def baiFile    = returnFile(row[5])
+            def recalTable = returnFile(row[6])
+
+            if (!hasExtension(bamFile, "bam")) exit 1, "File: ${bamFile} has the wrong extension. See --help for more information"
+            if (!hasExtension(baiFile, "bai")) exit 1, "File: ${baiFile} has the wrong extension. See --help for more information"
+            if (!hasExtension(recalTable, "recal.table")) exit 1, "File: ${recalTable} has the wrong extension. See --help for more information"
+
+            [idPatient, gender, status, idSample, bamFile, baiFile, recalTable]
+        }
+}
+
+// Parse first line of a FASTQ file, return the flowcell id and lane number.
+def flowcellLaneFromFastq(path) {
+    // expected format:
+    // xx:yy:FLOWCELLID:LANE:... (seven fields)
+    // or
+    // FLOWCELLID:LANE:xx:...
(five fields) + InputStream fileStream = new FileInputStream(path.toFile()) + InputStream gzipStream = new java.util.zip.GZIPInputStream(fileStream) + Reader decoder = new InputStreamReader(gzipStream, 'ASCII') + BufferedReader buffered = new BufferedReader(decoder) + def line = buffered.readLine() + assert line.startsWith('@') + line = line.substring(1) + def fields = line.split(' ')[0].split(':') + String fcid + int lane + if (fields.size() == 7) { + // CASAVA 1.8+ format + fcid = fields[2] + lane = fields[3].toInteger() + } else if (fields.size() == 5) { + fcid = fields[0] + lane = fields[1].toInteger() + } + [fcid, lane] +} + +// Check file extension +def hasExtension(it, extension) { + it.toString().toLowerCase().endsWith(extension.toLowerCase()) +} + +// Return file if it exists +def returnFile(it) { + if (!file(it).exists()) exit 1, "Missing file in TSV file: ${it}, see --help for more information" + return file(it) +} + +// Remove .ann .gz and .vcf extension from a VCF file +def reduceVCF(file) { + return file.fileName.toString().minus(".ann").minus(".vcf").minus(".gz") +} + +// Return status [0,1] +// 0 == Normal, 1 == Tumor +def returnStatus(it) { + if (!(it in [0, 1])) exit 1, "Status is not recognized in TSV file: ${it}, see --help for more information" + return it +} From 07c2d1b6fee7893056c5a1697235225eb612adec Mon Sep 17 00:00:00 2001 From: ggabernet Date: Tue, 14 Jul 2020 11:21:30 +0200 Subject: [PATCH 005/200] clean nextflow functions --- main.nf | 193 ++++---------------------------------------------------- 1 file changed, 11 insertions(+), 182 deletions(-) diff --git a/main.nf b/main.nf index fcb8687a93..040840a1d7 100644 --- a/main.nf +++ b/main.nf @@ -283,6 +283,17 @@ workflow { ch_workflow_summary) } +/* +================================================================================ + SEND COMPLETION EMAIL +================================================================================ + */ +workflow.onComplete { + def multiqc_report = [] + Completion.email(workflow, params, summary, run_name, baseDir, multiqc_report, log) + Completion.summary(workflow, params, log) +} + /* ================================================================================ GET software versions @@ -3399,186 +3410,4 @@ workflow { // """ // } -// // Completion e-mail notification -// workflow.onComplete { - -// // Set up the e-mail variables -// def subject = "[nf-core/sarek] Successful: $workflow.runName" -// if (!workflow.success) { -// subject = "[nf-core/sarek] FAILED: $workflow.runName" -// } -// def email_fields = [:] -// email_fields['version'] = workflow.manifest.version -// email_fields['runName'] = custom_runName ?: workflow.runName -// email_fields['success'] = workflow.success -// email_fields['dateComplete'] = workflow.complete -// email_fields['duration'] = workflow.duration -// email_fields['exitStatus'] = workflow.exitStatus -// email_fields['errorMessage'] = (workflow.errorMessage ?: 'None') -// email_fields['errorReport'] = (workflow.errorReport ?: 'None') -// email_fields['commandLine'] = workflow.commandLine -// email_fields['projectDir'] = workflow.projectDir -// email_fields['summary'] = summary -// email_fields['summary']['Date Started'] = workflow.start -// email_fields['summary']['Date Completed'] = workflow.complete -// email_fields['summary']['Pipeline script file path'] = workflow.scriptFile -// email_fields['summary']['Pipeline script hash ID'] = workflow.scriptId -// if (workflow.repository) email_fields['summary']['Pipeline repository Git URL'] = 
workflow.repository -// if (workflow.commitId) email_fields['summary']['Pipeline repository Git Commit'] = workflow.commitId -// if (workflow.revision) email_fields['summary']['Pipeline Git branch/tag'] = workflow.revision -// email_fields['summary']['Nextflow Version'] = workflow.nextflow.version -// email_fields['summary']['Nextflow Build'] = workflow.nextflow.build -// email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp - -// def mqc_report = null -// try { -// if (workflow.success) { -// mqc_report = ch_multiqc_report.getVal() -// if (mqc_report.getClass() == ArrayList) { -// log.warn "[nf-core/sarek] Found multiple reports from process 'multiqc', will use only one" -// mqc_report = mqc_report[0] -// } -// } -// } catch (all) { -// log.warn "[nf-core/sarek] Could not attach MultiQC report to summary email" -// } - -// // Check if we are only sending emails on failure -// email_address = params.email -// if (!params.email && params.email_on_fail && !workflow.success) { -// email_address = params.email_on_fail -// } - -// // Render the TXT template -// def engine = new groovy.text.GStringTemplateEngine() -// def tf = new File("$baseDir/assets/email_template.txt") -// def txt_template = engine.createTemplate(tf).make(email_fields) -// def email_txt = txt_template.toString() - -// // Render the HTML template -// def hf = new File("$baseDir/assets/email_template.html") -// def html_template = engine.createTemplate(hf).make(email_fields) -// def email_html = html_template.toString() - -// // Render the sendmail template -// def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$baseDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ] -// def sf = new File("$baseDir/assets/sendmail_template.txt") -// def sendmail_template = engine.createTemplate(sf).make(smail_fields) -// def sendmail_html = sendmail_template.toString() - -// // Send the HTML e-mail -// if (email_address) { -// try { -// if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } -// // Try to send HTML e-mail using sendmail -// [ 'sendmail', '-t' ].execute() << sendmail_html -// log.info "[nf-core/sarek] Sent summary e-mail to $email_address (sendmail)" -// } catch (all) { -// // Catch failures and try with plaintext -// [ 'mail', '-s', subject, email_address ].execute() << email_txt -// log.info "[nf-core/sarek] Sent summary e-mail to $email_address (mail)" -// } -// } - -// // Write summary e-mail HTML to a file -// def output_d = new File("${params.outdir}/pipeline_info/") -// if (!output_d.exists()) { -// output_d.mkdirs() -// } -// def output_hf = new File(output_d, "pipeline_report.html") -// output_hf.withWriter { w -> w << email_html } -// def output_tf = new File(output_d, "pipeline_report.txt") -// output_tf.withWriter { w -> w << email_txt } - -// c_green = params.monochrome_logs ? '' : "\033[0;32m"; -// c_purple = params.monochrome_logs ? '' : "\033[0;35m"; -// c_red = params.monochrome_logs ? '' : "\033[0;31m"; -// c_reset = params.monochrome_logs ? 
'' : "\033[0m"; - -// if (workflow.stats.ignoredCount > 0 && workflow.success) { -// log.info "-${c_purple}Warning, pipeline completed, but with errored process(es) ${c_reset}-" -// log.info "-${c_red}Number of ignored errored process(es) : ${workflow.stats.ignoredCount} ${c_reset}-" -// log.info "-${c_green}Number of successfully ran process(es) : ${workflow.stats.succeedCount} ${c_reset}-" -// } - -// if (workflow.success) { -// log.info "-${c_purple}[nf-core/sarek]${c_green} Pipeline completed successfully${c_reset}-" -// } else { -// checkHostname() -// log.info "-${c_purple}[nf-core/sarek]${c_red} Pipeline completed with errors${c_reset}-" -// } -// } - -// /* -// ================================================================================ -// nf-core functions -// ================================================================================ -// */ - -// def create_workflow_summary(summary) { -// def yaml_file = workDir.resolve('workflow_summary_mqc.yaml') -// yaml_file.text = """ -// id: 'nf-core-sarek-summary' -// description: " - this information is collected when the pipeline is started." -// section_name: 'nf-core/sarek Workflow Summary' -// section_href: 'https://github.com/nf-core/sarek' -// plot_type: 'html' -// data: | -//
-//         <dl class="dl-horizontal">
-//         ${summary.collect { k, v -> "<dt>$k</dt><dd>${v ?: 'N/A'}</dd>" }.join("\n")}
-//         </dl>
-// """.stripIndent() - -// return yaml_file -// } - -// def nfcoreHeader() { -// // Log colors ANSI codes -// c_black = params.monochrome_logs ? '' : "\033[0;30m"; -// c_blue = params.monochrome_logs ? '' : "\033[0;34m"; -// c_dim = params.monochrome_logs ? '' : "\033[2m"; -// c_green = params.monochrome_logs ? '' : "\033[0;32m"; -// c_purple = params.monochrome_logs ? '' : "\033[0;35m"; -// c_reset = params.monochrome_logs ? '' : "\033[0m"; -// c_white = params.monochrome_logs ? '' : "\033[0;37m"; -// c_yellow = params.monochrome_logs ? '' : "\033[0;33m"; - -// return """ -${c_dim}--------------------------------------------------${c_reset}- -// ${c_green},--.${c_black}/${c_green},-.${c_reset} -// ${c_blue} ___ __ __ __ ___ ${c_green}/,-._.--~\'${c_reset} -// ${c_blue} |\\ | |__ __ / ` / \\ |__) |__ ${c_yellow}} {${c_reset} -// ${c_blue} | \\| | \\__, \\__/ | \\ |___ ${c_green}\\`-._,-`-,${c_reset} -// ${c_green}`._,._,\'${c_reset} -// ${c_white}____${c_reset} -// ${c_white}.´ _ `.${c_reset} -// ${c_white}/ ${c_green}|\\${c_reset}`-_ \\${c_reset} ${c_blue} __ __ ___ ${c_reset} -// ${c_white}| ${c_green}| \\${c_reset} `-|${c_reset} ${c_blue}|__` /\\ |__) |__ |__/${c_reset} -// ${c_white}\\ ${c_green}| \\${c_reset} /${c_reset} ${c_blue}.__| /¯¯\\ | \\ |___ | \\${c_reset} -// ${c_white}`${c_green}|${c_reset}____${c_green}\\${c_reset}´${c_reset} - -// ${c_purple} nf-core/sarek v${workflow.manifest.version}${c_reset} -// -${c_dim}--------------------------------------------------${c_reset}- -// """.stripIndent() -// } - -// def checkHostname() { -// def c_reset = params.monochrome_logs ? '' : "\033[0m" -// def c_white = params.monochrome_logs ? '' : "\033[0;37m" -// def c_red = params.monochrome_logs ? '' : "\033[1;91m" -// def c_yellow_bold = params.monochrome_logs ? 
'' : "\033[1;93m" -// if (params.hostnames) { -// def hostname = "hostname".execute().text.trim() -// params.hostnames.each { prof, hnames -> -// hnames.each { hname -> -// if (hostname.contains(hname) && !workflow.profile.contains(prof)) { -// log.error "====================================================\n" + -// " ${c_red}WARNING!${c_reset} You are running with `-profile $workflow.profile`\n" + -// " but your machine hostname is ${c_white}'$hostname'${c_reset}\n" + -// " ${c_yellow_bold}It's highly recommended that you use `-profile $prof${c_reset}`\n" + -// "============================================================" -// } -// } -// } -// } -// } From 8e1d0b28cea96aca697cde280dd3751b4cb88688 Mon Sep 17 00:00:00 2001 From: ggabernet Date: Tue, 14 Jul 2020 15:59:15 +0200 Subject: [PATCH 006/200] commenting out unnecessary stuff for now --- conf/test.config | 26 +++-- lib/Headers.groovy | 2 +- main.nf | 236 ++++++++++++++++++++++++--------------------- 3 files changed, 144 insertions(+), 120 deletions(-) diff --git a/conf/test.config b/conf/test.config index 50f523c9b2..685a29af7e 100644 --- a/conf/test.config +++ b/conf/test.config @@ -18,7 +18,8 @@ params { // Input data input = 'https://raw.githubusercontent.com/nf-core/test-datasets/sarek/testdata/tsv/tiny-manta-https.tsv' - +/* + * TODO: uncomment when ready // Small reference genome igenomes_ignore = true genome = 'smallGRCh37' @@ -26,15 +27,20 @@ params { snpeff_db = 'WBcel235.86' species = 'caenorhabditis_elegans' vep_cache_version = '99' +*/ + } -process { - withName:Snpeff { - container = 'nfcore/sareksnpeff:dev.WBcel235' - maxForks = 1 - } - withLabel:VEP { - container = 'nfcore/sarekvep:dev.WBcel235' - maxForks = 1 - } +/* + * TODO: uncomment when ready + process { + withName:Snpeff { + container = 'nfcore/sareksnpeff:dev.WBcel235' + maxForks = 1 + } + withLabel:VEP { + container = 'nfcore/sarekvep:dev.WBcel235' + maxForks = 1 + } } +*/ diff --git a/lib/Headers.groovy b/lib/Headers.groovy index 447141125e..8fd9c8c6fc 100644 --- a/lib/Headers.groovy +++ b/lib/Headers.groovy @@ -20,7 +20,7 @@ class Headers { return colorcodes } - static String nf_core(workflow, monochrome_logs) {x + static String nf_core(workflow, monochrome_logs) { Map colors = log_colours(monochrome_logs) String.format( """\n diff --git a/main.nf b/main.nf index 040840a1d7..3987c09b4b 100644 --- a/main.nf +++ b/main.nf @@ -31,6 +31,19 @@ if (params.help) { exit 0 } +/* +================================================================================ + INCLUDE SAREK FUNCTIONS +================================================================================ +*/ +include { hasExtension; + defineStepList; + extractFastq; + extractInfos; + defineToolList; + checkParameterList; + extractBam; + extractFastqFromDir } from './modules/local/functions' /* ================================================================================ @@ -51,39 +64,40 @@ Checks.hostname(workflow, params, log) // Check the hostnames against configured ch_multiqc_config = file("$baseDir/assets/multiqc_config.yaml", checkIfExists: true) ch_multiqc_custom_config = params.multiqc_config ? 
Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty() ch_output_docs = file("$baseDir/docs/output.md", checkIfExists: true) +ch_output_docs_images = file("$baseDir/docs/images/", checkIfExists: true) -// Check if genome exists in the config file -if (params.genomes && !params.genomes.containsKey(params.genome) && !params.igenomes_ignore) { - exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}" -} else if (params.genomes && !params.genomes.containsKey(params.genome) && params.igenomes_ignore) { - exit 1, "The provided genome '${params.genome}' is not available in the genomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}" -} +// // Check if genome exists in the config file +// if (params.genomes && !params.genomes.containsKey(params.genome) && !params.igenomes_ignore) { +// exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}" +// } else if (params.genomes && !params.genomes.containsKey(params.genome) && params.igenomes_ignore) { +// exit 1, "The provided genome '${params.genome}' is not available in the genomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}" +// } stepList = defineStepList() step = params.step ? params.step.toLowerCase().replaceAll('-', '').replaceAll('_', '') : '' -// Handle deprecation -if (step == 'preprocessing') step = 'mapping' +// // Handle deprecation +// if (step == 'preprocessing') step = 'mapping' -if (step.contains(',')) exit 1, 'You can choose only one step, see --help for more information' -if (!checkParameterExistence(step, stepList)) exit 1, "Unknown step ${step}, see --help for more information" +// if (step.contains(',')) exit 1, 'You can choose only one step, see --help for more information' +// if (!checkParameterExistence(step, stepList)) exit 1, "Unknown step ${step}, see --help for more information" toolList = defineToolList() tools = params.tools ? params.tools.split(',').collect{it.trim().toLowerCase().replaceAll('-', '').replaceAll('_', '')} : [] if (step == 'controlfreec') tools = ['controlfreec'] if (!checkParameterList(tools, toolList)) exit 1, 'Unknown tool(s), see --help for more information' -skipQClist = defineSkipQClist() -skipQC = params.skip_qc ? params.skip_qc == 'all' ? skipQClist : params.skip_qc.split(',').collect{it.trim().toLowerCase().replaceAll('-', '').replaceAll('_', '')} : [] -if (!checkParameterList(skipQC, skipQClist)) exit 1, 'Unknown QC tool(s), see --help for more information' +// skipQClist = defineSkipQClist() +// skipQC = params.skip_qc ? params.skip_qc == 'all' ? skipQClist : params.skip_qc.split(',').collect{it.trim().toLowerCase().replaceAll('-', '').replaceAll('_', '')} : [] +// if (!checkParameterList(skipQC, skipQClist)) exit 1, 'Unknown QC tool(s), see --help for more information' -annoList = defineAnnoList() -annotateTools = params.annotate_tools ? params.annotate_tools.split(',').collect{it.trim().toLowerCase().replaceAll('-', '')} : [] -if (!checkParameterList(annotateTools,annoList)) exit 1, 'Unknown tool(s) to annotate, see --help for more information' +// annoList = defineAnnoList() +// annotateTools = params.annotate_tools ? 
params.annotate_tools.split(',').collect{it.trim().toLowerCase().replaceAll('-', '')} : [] +// if (!checkParameterList(annotateTools,annoList)) exit 1, 'Unknown tool(s) to annotate, see --help for more information' -// Check parameters -if ((params.ascat_ploidy && !params.ascat_purity) || (!params.ascat_ploidy && params.ascat_purity)) exit 1, 'Please specify both --ascat_purity and --ascat_ploidy, or none of them' -if (params.cf_window && params.cf_coeff) exit 1, 'Please specify either --cf_window OR --cf_coeff, but not both of them' +// // Check parameters +// if ((params.ascat_ploidy && !params.ascat_purity) || (!params.ascat_ploidy && params.ascat_purity)) exit 1, 'Please specify both --ascat_purity and --ascat_ploidy, or none of them' +// if (params.cf_window && params.cf_coeff) exit 1, 'Please specify either --cf_window OR --cf_coeff, but not both of them' // Handle input @@ -91,7 +105,7 @@ tsvPath = null if (params.input && (hasExtension(params.input, "tsv") || hasExtension(params.input, "vcf") || hasExtension(params.input, "vcf.gz"))) tsvPath = params.input if (params.input && (hasExtension(params.input, "vcf") || hasExtension(params.input, "vcf.gz"))) step = "annotate" -save_bam_mapped = params.skip_markduplicates ? true : params.save_bam_mapped ? true : false +// save_bam_mapped = params.skip_markduplicates ? true : params.save_bam_mapped ? true : false // If no input file specified, trying to get TSV files corresponding to step in the TSV directory // only for steps preparerecalibration, recalibrate, variantcalling and controlfreec @@ -156,60 +170,62 @@ if (tsvPath) { (genderMap, statusMap, inputSample) = extractInfos(inputSample) +inputSample.dump(tag: 'input sample') + /* ================================================================================ CHECKING REFERENCES ================================================================================ */ -// Initialize each params in params.genomes, catch the command line first if it was defined -// params.fasta has to be the first one -params.fasta = params.genome && !('annotate' in step) ? params.genomes[params.genome].fasta ?: null : null -// The rest can be sorted -params.ac_loci = params.genome && 'ascat' in tools ? params.genomes[params.genome].ac_loci ?: null : null -params.ac_loci_gc = params.genome && 'ascat' in tools ? params.genomes[params.genome].ac_loci_gc ?: null : null -params.bwa = params.genome && params.fasta && 'mapping' in step ? params.genomes[params.genome].bwa ?: null : null -params.chr_dir = params.genome && 'controlfreec' in tools ? params.genomes[params.genome].chr_dir ?: null : null -params.chr_length = params.genome && 'controlfreec' in tools ? params.genomes[params.genome].chr_length ?: null : null -params.dbsnp = params.genome && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || params.sentieon) ? params.genomes[params.genome].dbsnp ?: null : null -params.dbsnp_index = params.genome && params.dbsnp ? params.genomes[params.genome].dbsnp_index ?: null : null -params.dict = params.genome && params.fasta ? params.genomes[params.genome].dict ?: null : null -params.fasta_fai = params.genome && params.fasta ? params.genomes[params.genome].fasta_fai ?: null : null -params.germline_resource = params.genome && 'mutect2' in tools ? params.genomes[params.genome].germline_resource ?: null : null -params.germline_resource_index = params.genome && params.germline_resource ? 
params.genomes[params.genome].germline_resource_index ?: null : null -params.intervals = params.genome && !('annotate' in step) ? params.genomes[params.genome].intervals ?: null : null -params.known_indels = params.genome && ('mapping' in step || 'preparerecalibration' in step) ? params.genomes[params.genome].known_indels ?: null : null -params.known_indels_index = params.genome && params.known_indels ? params.genomes[params.genome].known_indels_index ?: null : null -params.mappability = params.genome && 'controlfreec' in tools ? params.genomes[params.genome].mappability ?: null : null -params.snpeff_db = params.genome && 'snpeff' in tools ? params.genomes[params.genome].snpeff_db ?: null : null -params.species = params.genome && 'vep' in tools ? params.genomes[params.genome].species ?: null : null -params.vep_cache_version = params.genome && 'vep' in tools ? params.genomes[params.genome].vep_cache_version ?: null : null - -// Initialize channels based on params -ch_ac_loci = params.ac_loci && 'ascat' in tools ? Channel.value(file(params.ac_loci)) : "null" -ch_ac_loci_gc = params.ac_loci_gc && 'ascat' in tools ? Channel.value(file(params.ac_loci_gc)) : "null" -ch_chr_dir = params.chr_dir && 'controlfreec' in tools ? Channel.value(file(params.chr_dir)) : "null" -ch_chr_length = params.chr_length && 'controlfreec' in tools ? Channel.value(file(params.chr_length)) : "null" -ch_dbsnp = params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || params.sentieon) ? Channel.value(file(params.dbsnp)) : "null" -ch_fasta = params.fasta && !('annotate' in step) ? Channel.value(file(params.fasta)) : "null" -ch_fai = params.fasta_fai && !('annotate' in step) ? Channel.value(file(params.fasta_fai)) : "null" -ch_germline_resource = params.germline_resource && 'mutect2' in tools ? Channel.value(file(params.germline_resource)) : "null" -ch_intervals = params.intervals && !params.no_intervals && !('annotate' in step) ? Channel.value(file(params.intervals)) : "null" -ch_known_indels = params.known_indels && ('mapping' in step || 'preparerecalibration' in step) ? Channel.value(file(params.known_indels)) : "null" -ch_mappability = params.mappability && 'controlfreec' in tools ? Channel.value(file(params.mappability)) : "null" - -ch_snpeff_cache = params.snpeff_cache ? Channel.value(file(params.snpeff_cache)) : "null" -ch_snpeff_db = params.snpeff_db ? Channel.value(params.snpeff_db) : "null" -ch_vep_cache_version = params.vep_cache_version ? Channel.value(params.vep_cache_version) : "null" -ch_vep_cache = params.vep_cache ? Channel.value(file(params.vep_cache)) : "null" - -// Optional files, not defined within the params.genomes[params.genome] scope -ch_cadd_indels = params.cadd_indels ? Channel.value(file(params.cadd_indels)) : "null" -ch_cadd_indels_tbi = params.cadd_indels_tbi ? Channel.value(file(params.cadd_indels_tbi)) : "null" -ch_cadd_wg_snvs = params.cadd_wg_snvs ? Channel.value(file(params.cadd_wg_snvs)) : "null" -ch_cadd_wg_snvs_tbi = params.cadd_wg_snvs_tbi ? Channel.value(file(params.cadd_wg_snvs_tbi)) : "null" -ch_pon = params.pon ? Channel.value(file(params.pon)) : "null" -ch_target_bed = params.target_bed ? Channel.value(file(params.target_bed)) : "null" +// // Initialize each params in params.genomes, catch the command line first if it was defined +// // params.fasta has to be the first one +// params.fasta = params.genome && !('annotate' in step) ? 
params.genomes[params.genome].fasta ?: null : null +// // The rest can be sorted +// params.ac_loci = params.genome && 'ascat' in tools ? params.genomes[params.genome].ac_loci ?: null : null +// params.ac_loci_gc = params.genome && 'ascat' in tools ? params.genomes[params.genome].ac_loci_gc ?: null : null +// params.bwa = params.genome && params.fasta && 'mapping' in step ? params.genomes[params.genome].bwa ?: null : null +// params.chr_dir = params.genome && 'controlfreec' in tools ? params.genomes[params.genome].chr_dir ?: null : null +// params.chr_length = params.genome && 'controlfreec' in tools ? params.genomes[params.genome].chr_length ?: null : null +// params.dbsnp = params.genome && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || params.sentieon) ? params.genomes[params.genome].dbsnp ?: null : null +// params.dbsnp_index = params.genome && params.dbsnp ? params.genomes[params.genome].dbsnp_index ?: null : null +// params.dict = params.genome && params.fasta ? params.genomes[params.genome].dict ?: null : null +// params.fasta_fai = params.genome && params.fasta ? params.genomes[params.genome].fasta_fai ?: null : null +// params.germline_resource = params.genome && 'mutect2' in tools ? params.genomes[params.genome].germline_resource ?: null : null +// params.germline_resource_index = params.genome && params.germline_resource ? params.genomes[params.genome].germline_resource_index ?: null : null +// params.intervals = params.genome && !('annotate' in step) ? params.genomes[params.genome].intervals ?: null : null +// params.known_indels = params.genome && ('mapping' in step || 'preparerecalibration' in step) ? params.genomes[params.genome].known_indels ?: null : null +// params.known_indels_index = params.genome && params.known_indels ? params.genomes[params.genome].known_indels_index ?: null : null +// params.mappability = params.genome && 'controlfreec' in tools ? params.genomes[params.genome].mappability ?: null : null +// params.snpeff_db = params.genome && 'snpeff' in tools ? params.genomes[params.genome].snpeff_db ?: null : null +// params.species = params.genome && 'vep' in tools ? params.genomes[params.genome].species ?: null : null +// params.vep_cache_version = params.genome && 'vep' in tools ? params.genomes[params.genome].vep_cache_version ?: null : null + +// // Initialize channels based on params +// ch_ac_loci = params.ac_loci && 'ascat' in tools ? Channel.value(file(params.ac_loci)) : "null" +// ch_ac_loci_gc = params.ac_loci_gc && 'ascat' in tools ? Channel.value(file(params.ac_loci_gc)) : "null" +// ch_chr_dir = params.chr_dir && 'controlfreec' in tools ? Channel.value(file(params.chr_dir)) : "null" +// ch_chr_length = params.chr_length && 'controlfreec' in tools ? Channel.value(file(params.chr_length)) : "null" +// ch_dbsnp = params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || params.sentieon) ? Channel.value(file(params.dbsnp)) : "null" +// ch_fasta = params.fasta && !('annotate' in step) ? Channel.value(file(params.fasta)) : "null" +// ch_fai = params.fasta_fai && !('annotate' in step) ? Channel.value(file(params.fasta_fai)) : "null" +// ch_germline_resource = params.germline_resource && 'mutect2' in tools ? Channel.value(file(params.germline_resource)) : "null" +// ch_intervals = params.intervals && !params.no_intervals && !('annotate' in step) ? 
Channel.value(file(params.intervals)) : "null" +// ch_known_indels = params.known_indels && ('mapping' in step || 'preparerecalibration' in step) ? Channel.value(file(params.known_indels)) : "null" +// ch_mappability = params.mappability && 'controlfreec' in tools ? Channel.value(file(params.mappability)) : "null" + +// ch_snpeff_cache = params.snpeff_cache ? Channel.value(file(params.snpeff_cache)) : "null" +// ch_snpeff_db = params.snpeff_db ? Channel.value(params.snpeff_db) : "null" +// ch_vep_cache_version = params.vep_cache_version ? Channel.value(params.vep_cache_version) : "null" +// ch_vep_cache = params.vep_cache ? Channel.value(file(params.vep_cache)) : "null" + +// // Optional files, not defined within the params.genomes[params.genome] scope +// ch_cadd_indels = params.cadd_indels ? Channel.value(file(params.cadd_indels)) : "null" +// ch_cadd_indels_tbi = params.cadd_indels_tbi ? Channel.value(file(params.cadd_indels_tbi)) : "null" +// ch_cadd_wg_snvs = params.cadd_wg_snvs ? Channel.value(file(params.cadd_wg_snvs)) : "null" +// ch_cadd_wg_snvs_tbi = params.cadd_wg_snvs_tbi ? Channel.value(file(params.cadd_wg_snvs_tbi)) : "null" +// ch_pon = params.pon ? Channel.value(file(params.pon)) : "null" +// ch_target_bed = params.target_bed ? Channel.value(file(params.target_bed)) : "null" /* ================================================================================ @@ -224,7 +240,7 @@ if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) { run_name = workflow.runName } summary = Schema.params_summary(workflow, params, run_name) -log.info Headers.nf_core(workflow, params, run_name) +log.info Headers.nf_core(workflow, params.monochrome_logs) log.info summary.collect { k,v -> "${k.padRight(20)}: $v" }.join("\n") log.info "-\033[2m----------------------------------------------------\033[0m-" @@ -254,6 +270,47 @@ include { FASTQC } from './modules/nf-core/fastqc' params(params) include { MULTIQC } from './modules/nf-core/multiqc' params(params) +// // PREPARING CHANNELS FOR PREPROCESSING AND QC + +inputBam = Channel.empty() +inputPairReads = Channel.empty() + +if (step in ['preparerecalibration', 'recalibrate', 'variantcalling', 'controlfreec', 'annotate']) { + inputBam.close() + inputPairReads.close() +} else inputSample.choice(inputPairReads, inputBam) {hasExtension(it[3], "bam") ? 
1 : 0} + +(inputBam, inputBamFastQC) = inputBam.into(2) + +// Removing inputFile2 which is null in case of uBAM +inputBamFastQC = inputBamFastQC.map { + idPatient, idSample, idRun, inputFile1, inputFile2 -> + [idPatient, idSample, idRun, inputFile1] +} + +if (params.split_fastq){ + inputPairReads = inputPairReads + // newly splitfastq are named based on split, so the name is easier to catch + .splitFastq(by: params.split_fastq, compress:true, file:"split", pe:true) + .map {idPatient, idSample, idRun, reads1, reads2 -> + // The split fastq read1 is the 4th element (indexed 3) its name is split_3 + // The split fastq read2's name is split_4 + // It's followed by which split it's acutally based on the mother fastq file + // Index start at 1 + // Extracting the index to get a new IdRun + splitIndex = reads1.fileName.toString().minus("split_3.").minus(".gz") + newIdRun = idRun + "_" + splitIndex + // Giving the files a new nice name + newReads1 = file("${idSample}_${newIdRun}_R1.fastq.gz") + newReads2 = file("${idSample}_${newIdRun}_R2.fastq.gz") + [idPatient, idSample, newIdRun, reads1, reads2]} +} + +inputPairReads = inputPairReads.dump(tag:'INPUT') + +(inputPairReads, inputPairReadsTrimGalore, inputPairReadsFastQC) = inputPairReads.into(3) + + /* ================================================================================ RUN THE WORKFLOW @@ -262,7 +319,7 @@ include { MULTIQC } from './modules/nf-core/multiqc' params(params) workflow { - CHECK_SAMPLESHEET(ch_input) + CHECK_SAMPLESHEET(inputSample) .splitCsv(header:true, sep:',') .map { check_samplesheet_paths(it) } .set { ch_raw_reads } @@ -612,45 +669,6 @@ workflow.onComplete { // (intBaseRecalibrator, intApplyBQSR, intHaplotypeCaller, intFreebayesSingle, intMpileup, bedIntervals) = bedIntervals.into(6) -// // PREPARING CHANNELS FOR PREPROCESSING AND QC - -// inputBam = Channel.create() -// inputPairReads = Channel.create() - -// if (step in ['preparerecalibration', 'recalibrate', 'variantcalling', 'controlfreec', 'annotate']) { -// inputBam.close() -// inputPairReads.close() -// } else inputSample.choice(inputPairReads, inputBam) {hasExtension(it[3], "bam") ? 
1 : 0} - -// (inputBam, inputBamFastQC) = inputBam.into(2) - -// // Removing inputFile2 which is null in case of uBAM -// inputBamFastQC = inputBamFastQC.map { -// idPatient, idSample, idRun, inputFile1, inputFile2 -> -// [idPatient, idSample, idRun, inputFile1] -// } - -// if (params.split_fastq){ -// inputPairReads = inputPairReads -// // newly splitfastq are named based on split, so the name is easier to catch -// .splitFastq(by: params.split_fastq, compress:true, file:"split", pe:true) -// .map {idPatient, idSample, idRun, reads1, reads2 -> -// // The split fastq read1 is the 4th element (indexed 3) its name is split_3 -// // The split fastq read2's name is split_4 -// // It's followed by which split it's acutally based on the mother fastq file -// // Index start at 1 -// // Extracting the index to get a new IdRun -// splitIndex = reads1.fileName.toString().minus("split_3.").minus(".gz") -// newIdRun = idRun + "_" + splitIndex -// // Giving the files a new nice name -// newReads1 = file("${idSample}_${newIdRun}_R1.fastq.gz") -// newReads2 = file("${idSample}_${newIdRun}_R2.fastq.gz") -// [idPatient, idSample, newIdRun, reads1, reads2]} -// } - -// inputPairReads = inputPairReads.dump(tag:'INPUT') - -// (inputPairReads, inputPairReadsTrimGalore, inputPairReadsFastQC) = inputPairReads.into(3) // // STEP 0.5: QC ON READS From 9dee363284d03e8f4003c0258bfc19e2dd4198f0 Mon Sep 17 00:00:00 2001 From: ggabernet Date: Tue, 14 Jul 2020 16:38:32 +0200 Subject: [PATCH 007/200] Making Fastqc work Co-authored-by: FriederikeHanssen Co-authored-by: Maxime Garcia --- .vscode/settings.json | 1 + main.nf | 71 ++++++++++++++++++--------------------- modules/nf-core/fastqc.nf | 35 +++++++------------ 3 files changed, 47 insertions(+), 60 deletions(-) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000000..2cd8968be8 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1 @@ +123,125 \ No newline at end of file diff --git a/main.nf b/main.nf index 3987c09b4b..9be6997270 100644 --- a/main.nf +++ b/main.nf @@ -270,45 +270,45 @@ include { FASTQC } from './modules/nf-core/fastqc' params(params) include { MULTIQC } from './modules/nf-core/multiqc' params(params) -// // PREPARING CHANNELS FOR PREPROCESSING AND QC +// PREPARING CHANNELS FOR PREPROCESSING AND QC -inputBam = Channel.empty() -inputPairReads = Channel.empty() +// inputBam = Channel.empty() +// inputPairReads = Channel.empty() -if (step in ['preparerecalibration', 'recalibrate', 'variantcalling', 'controlfreec', 'annotate']) { - inputBam.close() - inputPairReads.close() -} else inputSample.choice(inputPairReads, inputBam) {hasExtension(it[3], "bam") ? 1 : 0} +// if (step in ['preparerecalibration', 'recalibrate', 'variantcalling', 'controlfreec', 'annotate']) { +// inputBam.close() +// inputPairReads.close() +// } else inputSample.branch(inputPairReads, inputBam) {hasExtension(it[3], "bam") ? 
1 : 0} -(inputBam, inputBamFastQC) = inputBam.into(2) +// (inputBam, inputBamFastQC) = inputBam.into(2) -// Removing inputFile2 which is null in case of uBAM -inputBamFastQC = inputBamFastQC.map { - idPatient, idSample, idRun, inputFile1, inputFile2 -> - [idPatient, idSample, idRun, inputFile1] -} +// // Removing inputFile2 which is null in case of uBAM +// inputBamFastQC = inputBamFastQC.map { +// idPatient, idSample, idRun, inputFile1, inputFile2 -> +// [idPatient, idSample, idRun, inputFile1] +// } -if (params.split_fastq){ - inputPairReads = inputPairReads - // newly splitfastq are named based on split, so the name is easier to catch - .splitFastq(by: params.split_fastq, compress:true, file:"split", pe:true) - .map {idPatient, idSample, idRun, reads1, reads2 -> - // The split fastq read1 is the 4th element (indexed 3) its name is split_3 - // The split fastq read2's name is split_4 - // It's followed by which split it's acutally based on the mother fastq file - // Index start at 1 - // Extracting the index to get a new IdRun - splitIndex = reads1.fileName.toString().minus("split_3.").minus(".gz") - newIdRun = idRun + "_" + splitIndex - // Giving the files a new nice name - newReads1 = file("${idSample}_${newIdRun}_R1.fastq.gz") - newReads2 = file("${idSample}_${newIdRun}_R2.fastq.gz") - [idPatient, idSample, newIdRun, reads1, reads2]} -} +// if (params.split_fastq){ +// inputPairReads = inputPairReads +// // newly splitfastq are named based on split, so the name is easier to catch +// .splitFastq(by: params.split_fastq, compress:true, file:"split", pe:true) +// .map {idPatient, idSample, idRun, reads1, reads2 -> +// // The split fastq read1 is the 4th element (indexed 3) its name is split_3 +// // The split fastq read2's name is split_4 +// // It's followed by which split it's acutally based on the mother fastq file +// // Index start at 1 +// // Extracting the index to get a new IdRun +// splitIndex = reads1.fileName.toString().minus("split_3.").minus(".gz") +// newIdRun = idRun + "_" + splitIndex +// // Giving the files a new nice name +// newReads1 = file("${idSample}_${newIdRun}_R1.fastq.gz") +// newReads2 = file("${idSample}_${newIdRun}_R2.fastq.gz") +// [idPatient, idSample, newIdRun, reads1, reads2]} +//} -inputPairReads = inputPairReads.dump(tag:'INPUT') +// inputPairReads.dump(tag:'INPUT') -(inputPairReads, inputPairReadsTrimGalore, inputPairReadsFastQC) = inputPairReads.into(3) +// (inputPairReads, inputPairReadsTrimGalore, inputPairReadsFastQC) = inputPairReads.into(3) /* @@ -319,12 +319,7 @@ inputPairReads = inputPairReads.dump(tag:'INPUT') workflow { - CHECK_SAMPLESHEET(inputSample) - .splitCsv(header:true, sep:',') - .map { check_samplesheet_paths(it) } - .set { ch_raw_reads } - - FASTQC(ch_raw_reads) + FASTQC(inputSample) OUTPUT_DOCUMENTATION( ch_output_docs, diff --git a/modules/nf-core/fastqc.nf b/modules/nf-core/fastqc.nf index 5129ab4884..1aec743521 100644 --- a/modules/nf-core/fastqc.nf +++ b/modules/nf-core/fastqc.nf @@ -2,31 +2,22 @@ * FastQC */ process FASTQC { - tag "$name" - label 'process_medium' - publishDir "${params.outdir}/fastqc", mode: params.publish_dir_mode, - saveAs: { filename -> - filename.indexOf(".zip") > 0 ? 
"zips/$filename" : "$filename" - } + label 'FastQC' + label 'cpus_2' - input: - tuple val(name), val(single_end), path(reads) + tag "${idPatient}-${idRun}" + + publishDir "${params.outdir}/Reports/${idSample}/FastQC/${idSample}_${idRun}", mode: params.publish_dir_mode + input: + tuple val(idPatient), val(idSample), val(idRun), file("${idSample}_${idRun}_R1.fastq.gz"), file("${idSample}_${idRun}_R2.fastq.gz") + output: - path "*.{zip,html}" + path "*.{html,zip}" script: - // Add soft-links to original FastQs for consistent naming in pipeline - if (single_end) { - """ - [ ! -f ${name}.fastq.gz ] && ln -s $reads ${name}.fastq.gz - fastqc --quiet --threads $task.cpus ${name}.fastq.gz - """ - } else { - """ - [ ! -f ${name}_1.fastq.gz ] && ln -s ${reads[0]} ${name}_1.fastq.gz - [ ! -f ${name}_2.fastq.gz ] && ln -s ${reads[1]} ${name}_2.fastq.gz - fastqc --quiet --threads $task.cpus ${name}_1.fastq.gz ${name}_2.fastq.gz - """ - } + """ + fastqc -t 2 -q ${idSample}_${idRun}_R1.fastq.gz ${idSample}_${idRun}_R2.fastq.gz + """ } + From a49492eeda422a713e6e2e46b16a3681c3ceb790 Mon Sep 17 00:00:00 2001 From: ggabernet Date: Tue, 14 Jul 2020 16:49:35 +0200 Subject: [PATCH 008/200] update get_software_versions --- modules/local/get_software_versions.nf | 27 ++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/modules/local/get_software_versions.nf b/modules/local/get_software_versions.nf index 289234b0b2..5f7a385ff1 100644 --- a/modules/local/get_software_versions.nf +++ b/modules/local/get_software_versions.nf @@ -15,10 +15,29 @@ process GET_SOFTWARE_VERSIONS { script: // TODO nf-core: Get all tools to print their version number here """ - echo $workflow.manifest.version > v_pipeline.txt - echo $workflow.nextflow.version > v_nextflow.txt - fastqc --version > v_fastqc.txt - multiqc --version > v_multiqc.txt + alleleCounter --version &> v_allelecount.txt 2>&1 || true + bcftools --version &> v_bcftools.txt 2>&1 || true + bwa &> v_bwa.txt 2>&1 || true + cnvkit.py version &> v_cnvkit.txt 2>&1 || true + configManta.py --version &> v_manta.txt 2>&1 || true + configureStrelkaGermlineWorkflow.py --version &> v_strelka.txt 2>&1 || true + echo "${workflow.manifest.version}" &> v_pipeline.txt 2>&1 || true + echo "${workflow.nextflow.version}" &> v_nextflow.txt 2>&1 || true + snpEff -version &> v_snpeff.txt 2>&1 || true + fastqc --version &> v_fastqc.txt 2>&1 || true + freebayes --version &> v_freebayes.txt 2>&1 || true + freec &> v_controlfreec.txt 2>&1 || true + gatk ApplyBQSR --help &> v_gatk.txt 2>&1 || true + msisensor &> v_msisensor.txt 2>&1 || true + multiqc --version &> v_multiqc.txt 2>&1 || true + qualimap --version &> v_qualimap.txt 2>&1 || true + R --version &> v_r.txt 2>&1 || true + R -e "library(ASCAT); help(package='ASCAT')" &> v_ascat.txt 2>&1 || true + samtools --version &> v_samtools.txt 2>&1 || true + tiddit &> v_tiddit.txt 2>&1 || true + trim_galore -v &> v_trim_galore.txt 2>&1 || true + vcftools --version &> v_vcftools.txt 2>&1 || true + vep --help &> v_vep.txt 2>&1 || true scrape_software_versions.py &> software_versions_mqc.yaml """ } From 966ab2a0a275c66c618e86cdd8961d6d91ee826f Mon Sep 17 00:00:00 2001 From: ggabernet Date: Tue, 14 Jul 2020 16:55:27 +0200 Subject: [PATCH 009/200] Co-authored-by: FriederikeHanssen Co-authored-by: Maxime Garcia update get_software_versions --- main.nf | 50 ++------------------------------------------------ 1 file changed, 2 insertions(+), 48 deletions(-) diff --git a/main.nf b/main.nf index 9be6997270..0c28affd0e 100644 --- 
a/main.nf +++ b/main.nf @@ -170,7 +170,7 @@ if (tsvPath) { (genderMap, statusMap, inputSample) = extractInfos(inputSample) -inputSample.dump(tag: 'input sample') +// inputSample.dump(tag: 'input sample') /* ================================================================================ @@ -317,7 +317,7 @@ include { MULTIQC } from './modules/nf-core/multiqc' params(params) ================================================================================ */ -workflow { +workflow variant_calling{ FASTQC(inputSample) @@ -346,52 +346,6 @@ workflow.onComplete { Completion.summary(workflow, params, log) } -/* -================================================================================ - GET software versions -================================================================================ -*/ -// process Get_software_versions { -// publishDir path:"${params.outdir}/pipeline_info", mode: params.publish_dir_mode, -// saveAs: {it.indexOf(".csv") > 0 ? it : null} - -// output: -// file 'software_versions_mqc.yaml' into ch_software_versions_yaml -// file "software_versions.csv" - -// when: !('versions' in skipQC) - -// script: -// """ -// alleleCounter --version &> v_allelecount.txt 2>&1 || true -// bcftools --version &> v_bcftools.txt 2>&1 || true -// bwa &> v_bwa.txt 2>&1 || true -// cnvkit.py version &> v_cnvkit.txt 2>&1 || true -// configManta.py --version &> v_manta.txt 2>&1 || true -// configureStrelkaGermlineWorkflow.py --version &> v_strelka.txt 2>&1 || true -// echo "${workflow.manifest.version}" &> v_pipeline.txt 2>&1 || true -// echo "${workflow.nextflow.version}" &> v_nextflow.txt 2>&1 || true -// snpEff -version &> v_snpeff.txt 2>&1 || true -// fastqc --version &> v_fastqc.txt 2>&1 || true -// freebayes --version &> v_freebayes.txt 2>&1 || true -// freec &> v_controlfreec.txt 2>&1 || true -// gatk ApplyBQSR --help &> v_gatk.txt 2>&1 || true -// msisensor &> v_msisensor.txt 2>&1 || true -// multiqc --version &> v_multiqc.txt 2>&1 || true -// qualimap --version &> v_qualimap.txt 2>&1 || true -// R --version &> v_r.txt 2>&1 || true -// R -e "library(ASCAT); help(package='ASCAT')" &> v_ascat.txt 2>&1 || true -// samtools --version &> v_samtools.txt 2>&1 || true -// tiddit &> v_tiddit.txt 2>&1 || true -// trim_galore -v &> v_trim_galore.txt 2>&1 || true -// vcftools --version &> v_vcftools.txt 2>&1 || true -// vep --help &> v_vep.txt 2>&1 || true - -// scrape_software_versions.py &> software_versions_mqc.yaml -// """ -// } - -// ch_software_versions_yaml = ch_software_versions_yaml.dump(tag:'SOFTWARE VERSIONS') // /* // ================================================================================ From ab7ed2673b84f120248b5f924ecc34bdf11fd870 Mon Sep 17 00:00:00 2001 From: ggabernet Date: Tue, 14 Jul 2020 17:40:00 +0200 Subject: [PATCH 010/200] work on indices Co-authored-by: FriederikeHanssen Co-authored-by: Maxime Garcia --- main.nf | 97 +++++++++++++------------- modules/local/build_intervals.nf | 19 +++++ modules/local/buildindices.nf | 49 +++++++++++++ modules/local/check_samplesheet.nf | 34 --------- modules/local/gatk_dict.nf | 22 ++++++ modules/local/get_software_versions.nf | 1 - modules/local/mapreads.nf | 0 modules/nf-core/bwamem2_index.nf | 19 +++++ modules/nf-core/fastqc.nf | 2 + modules/nf-core/htslib_tabix.nf | 16 +++++ modules/nf-core/samtools_faidx.nf | 19 +++++ 11 files changed, 194 insertions(+), 84 deletions(-) create mode 100644 modules/local/build_intervals.nf create mode 100644 modules/local/buildindices.nf delete mode 100644 modules/local/check_samplesheet.nf 
create mode 100644 modules/local/gatk_dict.nf create mode 100644 modules/local/mapreads.nf create mode 100644 modules/nf-core/bwamem2_index.nf create mode 100644 modules/nf-core/htslib_tabix.nf create mode 100644 modules/nf-core/samtools_faidx.nf diff --git a/main.nf b/main.nf index 0c28affd0e..c4e9b23fb9 100644 --- a/main.nf +++ b/main.nf @@ -178,54 +178,54 @@ if (tsvPath) { ================================================================================ */ -// // Initialize each params in params.genomes, catch the command line first if it was defined -// // params.fasta has to be the first one -// params.fasta = params.genome && !('annotate' in step) ? params.genomes[params.genome].fasta ?: null : null -// // The rest can be sorted -// params.ac_loci = params.genome && 'ascat' in tools ? params.genomes[params.genome].ac_loci ?: null : null -// params.ac_loci_gc = params.genome && 'ascat' in tools ? params.genomes[params.genome].ac_loci_gc ?: null : null -// params.bwa = params.genome && params.fasta && 'mapping' in step ? params.genomes[params.genome].bwa ?: null : null -// params.chr_dir = params.genome && 'controlfreec' in tools ? params.genomes[params.genome].chr_dir ?: null : null -// params.chr_length = params.genome && 'controlfreec' in tools ? params.genomes[params.genome].chr_length ?: null : null -// params.dbsnp = params.genome && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || params.sentieon) ? params.genomes[params.genome].dbsnp ?: null : null -// params.dbsnp_index = params.genome && params.dbsnp ? params.genomes[params.genome].dbsnp_index ?: null : null -// params.dict = params.genome && params.fasta ? params.genomes[params.genome].dict ?: null : null -// params.fasta_fai = params.genome && params.fasta ? params.genomes[params.genome].fasta_fai ?: null : null -// params.germline_resource = params.genome && 'mutect2' in tools ? params.genomes[params.genome].germline_resource ?: null : null -// params.germline_resource_index = params.genome && params.germline_resource ? params.genomes[params.genome].germline_resource_index ?: null : null -// params.intervals = params.genome && !('annotate' in step) ? params.genomes[params.genome].intervals ?: null : null -// params.known_indels = params.genome && ('mapping' in step || 'preparerecalibration' in step) ? params.genomes[params.genome].known_indels ?: null : null -// params.known_indels_index = params.genome && params.known_indels ? params.genomes[params.genome].known_indels_index ?: null : null -// params.mappability = params.genome && 'controlfreec' in tools ? params.genomes[params.genome].mappability ?: null : null -// params.snpeff_db = params.genome && 'snpeff' in tools ? params.genomes[params.genome].snpeff_db ?: null : null -// params.species = params.genome && 'vep' in tools ? params.genomes[params.genome].species ?: null : null -// params.vep_cache_version = params.genome && 'vep' in tools ? params.genomes[params.genome].vep_cache_version ?: null : null - -// // Initialize channels based on params -// ch_ac_loci = params.ac_loci && 'ascat' in tools ? Channel.value(file(params.ac_loci)) : "null" -// ch_ac_loci_gc = params.ac_loci_gc && 'ascat' in tools ? Channel.value(file(params.ac_loci_gc)) : "null" -// ch_chr_dir = params.chr_dir && 'controlfreec' in tools ? Channel.value(file(params.chr_dir)) : "null" -// ch_chr_length = params.chr_length && 'controlfreec' in tools ? 
Channel.value(file(params.chr_length)) : "null" -// ch_dbsnp = params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || params.sentieon) ? Channel.value(file(params.dbsnp)) : "null" -// ch_fasta = params.fasta && !('annotate' in step) ? Channel.value(file(params.fasta)) : "null" -// ch_fai = params.fasta_fai && !('annotate' in step) ? Channel.value(file(params.fasta_fai)) : "null" -// ch_germline_resource = params.germline_resource && 'mutect2' in tools ? Channel.value(file(params.germline_resource)) : "null" -// ch_intervals = params.intervals && !params.no_intervals && !('annotate' in step) ? Channel.value(file(params.intervals)) : "null" -// ch_known_indels = params.known_indels && ('mapping' in step || 'preparerecalibration' in step) ? Channel.value(file(params.known_indels)) : "null" -// ch_mappability = params.mappability && 'controlfreec' in tools ? Channel.value(file(params.mappability)) : "null" - -// ch_snpeff_cache = params.snpeff_cache ? Channel.value(file(params.snpeff_cache)) : "null" -// ch_snpeff_db = params.snpeff_db ? Channel.value(params.snpeff_db) : "null" -// ch_vep_cache_version = params.vep_cache_version ? Channel.value(params.vep_cache_version) : "null" -// ch_vep_cache = params.vep_cache ? Channel.value(file(params.vep_cache)) : "null" - -// // Optional files, not defined within the params.genomes[params.genome] scope -// ch_cadd_indels = params.cadd_indels ? Channel.value(file(params.cadd_indels)) : "null" -// ch_cadd_indels_tbi = params.cadd_indels_tbi ? Channel.value(file(params.cadd_indels_tbi)) : "null" -// ch_cadd_wg_snvs = params.cadd_wg_snvs ? Channel.value(file(params.cadd_wg_snvs)) : "null" -// ch_cadd_wg_snvs_tbi = params.cadd_wg_snvs_tbi ? Channel.value(file(params.cadd_wg_snvs_tbi)) : "null" -// ch_pon = params.pon ? Channel.value(file(params.pon)) : "null" -// ch_target_bed = params.target_bed ? Channel.value(file(params.target_bed)) : "null" +// Initialize each params in params.genomes, catch the command line first if it was defined +// params.fasta has to be the first one +params.fasta = params.genome && !('annotate' in step) ? params.genomes[params.genome].fasta ?: null : null +// The rest can be sorted +params.ac_loci = params.genome && 'ascat' in tools ? params.genomes[params.genome].ac_loci ?: null : null +params.ac_loci_gc = params.genome && 'ascat' in tools ? params.genomes[params.genome].ac_loci_gc ?: null : null +params.bwa = params.genome && params.fasta && 'mapping' in step ? params.genomes[params.genome].bwa ?: null : null +params.chr_dir = params.genome && 'controlfreec' in tools ? params.genomes[params.genome].chr_dir ?: null : null +params.chr_length = params.genome && 'controlfreec' in tools ? params.genomes[params.genome].chr_length ?: null : null +params.dbsnp = params.genome && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || params.sentieon) ? params.genomes[params.genome].dbsnp ?: null : null +params.dbsnp_index = params.genome && params.dbsnp ? params.genomes[params.genome].dbsnp_index ?: null : null +params.dict = params.genome && params.fasta ? params.genomes[params.genome].dict ?: null : null +params.fasta_fai = params.genome && params.fasta ? params.genomes[params.genome].fasta_fai ?: null : null +params.germline_resource = params.genome && 'mutect2' in tools ? 
params.genomes[params.genome].germline_resource ?: null : null +params.germline_resource_index = params.genome && params.germline_resource ? params.genomes[params.genome].germline_resource_index ?: null : null +params.intervals = params.genome && !('annotate' in step) ? params.genomes[params.genome].intervals ?: null : null +params.known_indels = params.genome && ('mapping' in step || 'preparerecalibration' in step) ? params.genomes[params.genome].known_indels ?: null : null +params.known_indels_index = params.genome && params.known_indels ? params.genomes[params.genome].known_indels_index ?: null : null +params.mappability = params.genome && 'controlfreec' in tools ? params.genomes[params.genome].mappability ?: null : null +params.snpeff_db = params.genome && 'snpeff' in tools ? params.genomes[params.genome].snpeff_db ?: null : null +params.species = params.genome && 'vep' in tools ? params.genomes[params.genome].species ?: null : null +params.vep_cache_version = params.genome && 'vep' in tools ? params.genomes[params.genome].vep_cache_version ?: null : null + +// Initialize channels based on params +ch_ac_loci = params.ac_loci && 'ascat' in tools ? Channel.value(file(params.ac_loci)) : "null" +ch_ac_loci_gc = params.ac_loci_gc && 'ascat' in tools ? Channel.value(file(params.ac_loci_gc)) : "null" +ch_chr_dir = params.chr_dir && 'controlfreec' in tools ? Channel.value(file(params.chr_dir)) : "null" +ch_chr_length = params.chr_length && 'controlfreec' in tools ? Channel.value(file(params.chr_length)) : "null" +ch_dbsnp = params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || params.sentieon) ? Channel.value(file(params.dbsnp)) : "null" +ch_fasta = params.fasta && !('annotate' in step) ? Channel.value(file(params.fasta)) : "null" +ch_fai = params.fasta_fai && !('annotate' in step) ? Channel.value(file(params.fasta_fai)) : "null" +ch_germline_resource = params.germline_resource && 'mutect2' in tools ? Channel.value(file(params.germline_resource)) : "null" +ch_intervals = params.intervals && !params.no_intervals && !('annotate' in step) ? Channel.value(file(params.intervals)) : "null" +ch_known_indels = params.known_indels && ('mapping' in step || 'preparerecalibration' in step) ? Channel.value(file(params.known_indels)) : "null" +ch_mappability = params.mappability && 'controlfreec' in tools ? Channel.value(file(params.mappability)) : "null" + +ch_snpeff_cache = params.snpeff_cache ? Channel.value(file(params.snpeff_cache)) : "null" +ch_snpeff_db = params.snpeff_db ? Channel.value(params.snpeff_db) : "null" +ch_vep_cache_version = params.vep_cache_version ? Channel.value(params.vep_cache_version) : "null" +ch_vep_cache = params.vep_cache ? Channel.value(file(params.vep_cache)) : "null" + +// Optional files, not defined within the params.genomes[params.genome] scope +ch_cadd_indels = params.cadd_indels ? Channel.value(file(params.cadd_indels)) : "null" +ch_cadd_indels_tbi = params.cadd_indels_tbi ? Channel.value(file(params.cadd_indels_tbi)) : "null" +ch_cadd_wg_snvs = params.cadd_wg_snvs ? Channel.value(file(params.cadd_wg_snvs)) : "null" +ch_cadd_wg_snvs_tbi = params.cadd_wg_snvs_tbi ? Channel.value(file(params.cadd_wg_snvs_tbi)) : "null" +ch_pon = params.pon ? Channel.value(file(params.pon)) : "null" +ch_target_bed = params.target_bed ? 
Channel.value(file(params.target_bed)) : "null" /* ================================================================================ @@ -259,7 +259,6 @@ if (params.sentieon) log.warn "[nf-core/sarek] Sentieon will be used, only works */ include { OUTPUT_DOCUMENTATION } from './modules/local/output_documentation' params(params) include { GET_SOFTWARE_VERSIONS } from './modules/local/get_software_versions' params(params) -include { CHECK_SAMPLESHEET; check_samplesheet_paths } from './modules/local/check_samplesheet' params(params) /* ================================================================================ diff --git a/modules/local/build_intervals.nf b/modules/local/build_intervals.nf new file mode 100644 index 0000000000..d235885335 --- /dev/null +++ b/modules/local/build_intervals.nf @@ -0,0 +1,19 @@ +process BUILD_INTERVALS { + tag "${fastaFai}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: {params.save_reference ? "reference_genome/${it}" : null } + + input: + path file(fastaFai) + + output: + path file("${fastaFai.baseName}.bed") + + //when: !(params.intervals) && !('annotate' in step) && !('controlfreec' in step) + + script: + """ + awk -v FS='\t' -v OFS='\t' '{ print \$1, \"0\", \$2 }' ${fastaFai} > ${fastaFai.baseName}.bed + """ +} \ No newline at end of file diff --git a/modules/local/buildindices.nf b/modules/local/buildindices.nf new file mode 100644 index 0000000000..b4dd390be2 --- /dev/null +++ b/modules/local/buildindices.nf @@ -0,0 +1,49 @@ +/* +================================================================================ + BUILDING INDEXES +================================================================================ +*/ + +// And then initialize channels based on params or indexes that were just built + + +include HTSLIB_TABIX as HTSLIB_TABIX_DBSNP from '../../nf-core/htslib_tabix' +include HTSLIB_TABIX as HTSLIB_TABIX_GERMLINE_RESOURCE from '../../nf-core/htslib_tabix' +include HTSLIB_TABIX as HTSLIB_TABIX_KNOWN_INDELS from '../../nf-core/htslib_tabix' +include HTSLIB_TABIX as HTSLIB_TABIX_PON from '../../nf-core/htslib_tabix' + +ch_known_indels_tbi = params.known_indels ? params.known_indels_index ? Channel.value(file(params.known_indels_index)) : known_indels_tbi.collect() : "null" + + +ch_pon_tbi = params.pon ? params.pon_index ? Channel.value(file(params.pon_index)) : pon_tbi : "null" + + + +ch_intervals = params.no_intervals ? "null" : params.intervals && !('annotate' in step) ? Channel.value(file(params.intervals)) : intervalBuilt + +workflow build_indices{ + + BWAMEM2_INDEX(ch_fasta) + + GATK_CREATE_SEQUENCE_DICTIONARY(ch_fasta) + + SAMTOOLS_FAIDX(ch_fasta) + + HTSLIB_TABIX_DBSNP(ch_dbsnp) //ch_dbsnp + + HTSLIB_TABIX_GERMLINE_RESOURCE(ch_germline_resource) //ch_germline_resource + + HTSLIB_TABIX_KNOWN_INDELS(ch_known_indels) //ch_knwon_indels + +} + +ch_bwa = params.bwa ? Channel.value(file(params.bwa)) : BWAMEM2_INDEX.out + +ch_dict = params.dict ? Channel.value(file(params.dict)) : GATK_CREATE_SEQUENCE_DICTIONARY.out + +ch_fai = params.fasta_fai ? Channel.value(file(params.fasta_fai)) : SAMTOOLS_FAIDX.out + +ch_dbsnp_tbi = params.dbsnp ? params.dbsnp_index ? Channel.value(file(params.dbsnp_index)) : dbsnp_tbi : "null" + +ch_germline_resource_tbi = params.germline_resource ? params.germline_resource_index ? 
Channel.value(file(params.germline_resource_index)) : germline_resource_tbi : "null" + diff --git a/modules/local/check_samplesheet.nf b/modules/local/check_samplesheet.nf deleted file mode 100644 index 42de176519..0000000000 --- a/modules/local/check_samplesheet.nf +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Reformat input samplesheet and check validity - */ -process CHECK_SAMPLESHEET { - tag "$samplesheet" - publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode - - input: - path samplesheet - - output: - path '*.csv' - - script: // This script is bundled with the pipeline, in nf-core/tcrseq/bin/ - """ - check_samplesheet.py $samplesheet samplesheet.valid.csv - """ -} - -// Function to get list of [ sample, single_end?, [ fastq_1, fastq_2 ] ] -def check_samplesheet_paths(LinkedHashMap samplesheet) { - def sample = samplesheet.sample - def single_end = samplesheet.single_end.toBoolean() - def fastq_1 = samplesheet.fastq_1 - def fastq_2 = samplesheet.fastq_2 - - def array = [] - if (single_end) { - array = [ sample, single_end, [ file(fastq_1, checkIfExists: true) ] ] - } else { - array = [ sample, single_end, [ file(fastq_1, checkIfExists: true), file(fastq_2, checkIfExists: true) ] ] - } - return array -} diff --git a/modules/local/gatk_dict.nf b/modules/local/gatk_dict.nf new file mode 100644 index 0000000000..381ee4dcae --- /dev/null +++ b/modules/local/gatk_dict.nf @@ -0,0 +1,22 @@ +process GATK_CREATE_SEQUENCE_DICTIONARY { + tag "${fasta}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: {params.save_reference ? "reference_genome/${it}" : null } + + input: + path file(fasta) + + output: + path file("${fasta.baseName}.dict") + + //when: !(params.dict) && params.fasta && !('annotate' in step) && !('controlfreec' in step) + + script: + """ + gatk --java-options "-Xmx${task.memory.toGiga()}g" \ + CreateSequenceDictionary \ + --REFERENCE ${fasta} \ + --OUTPUT ${fasta.baseName}.dict + """ +} \ No newline at end of file diff --git a/modules/local/get_software_versions.nf b/modules/local/get_software_versions.nf index 5f7a385ff1..5d29cf8319 100644 --- a/modules/local/get_software_versions.nf +++ b/modules/local/get_software_versions.nf @@ -13,7 +13,6 @@ process GET_SOFTWARE_VERSIONS { path "software_versions.csv", emit: software_versions_csv script: - // TODO nf-core: Get all tools to print their version number here """ alleleCounter --version &> v_allelecount.txt 2>&1 || true bcftools --version &> v_bcftools.txt 2>&1 || true diff --git a/modules/local/mapreads.nf b/modules/local/mapreads.nf new file mode 100644 index 0000000000..e69de29bb2 diff --git a/modules/nf-core/bwamem2_index.nf b/modules/nf-core/bwamem2_index.nf new file mode 100644 index 0000000000..d039043890 --- /dev/null +++ b/modules/nf-core/bwamem2_index.nf @@ -0,0 +1,19 @@ +process BWAMEM2_INDEX { + tag "${fasta}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: {params.save_reference ? 
"reference_genome/BWAIndex/${it}" : null } + + input: + path file(fasta) + + output: + path file("${fasta}.*") + + //when: !(params.bwa) && params.fasta && 'mapping' in step + + script: + """ + bwa-mem2 index ${fasta} + """ +} diff --git a/modules/nf-core/fastqc.nf b/modules/nf-core/fastqc.nf index 1aec743521..d152faf568 100644 --- a/modules/nf-core/fastqc.nf +++ b/modules/nf-core/fastqc.nf @@ -15,6 +15,8 @@ process FASTQC { output: path "*.{html,zip}" + // when: !('fastqc' in skipQC) + script: """ fastqc -t 2 -q ${idSample}_${idRun}_R1.fastq.gz ${idSample}_${idRun}_R2.fastq.gz diff --git a/modules/nf-core/htslib_tabix.nf b/modules/nf-core/htslib_tabix.nf new file mode 100644 index 0000000000..85dca847e2 --- /dev/null +++ b/modules/nf-core/htslib_tabix.nf @@ -0,0 +1,16 @@ +process htslib_tabix { + tag {vcf} + + container 'quay.io/biocontainers/tabix:0.2.6--ha92aebf_0' + + input: + path(vcf) + + output: + path("${vcf}.tbi") + + script: + """ + tabix -p vcf ${vcf} + """ +} \ No newline at end of file diff --git a/modules/nf-core/samtools_faidx.nf b/modules/nf-core/samtools_faidx.nf new file mode 100644 index 0000000000..72d6d6f7c9 --- /dev/null +++ b/modules/nf-core/samtools_faidx.nf @@ -0,0 +1,19 @@ +process SAMTOOLS_FAIDX { + tag "${fasta}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: {params.save_reference ? "reference_genome/${it}" : null } + + input: + path file(fasta) + + output: + path file("${fasta}.fai") + + //when: !(params.fasta_fai) && params.fasta && !('annotate' in step) + + script: + """ + samtools faidx ${fasta} + """ +} From 0cfffa6bbda59419e3a0745a0ba3543f236c580a Mon Sep 17 00:00:00 2001 From: ggabernet Date: Tue, 14 Jul 2020 18:19:37 +0200 Subject: [PATCH 011/200] buildindices subworkflow Co-authored-by: FriederikeHanssen --- conf/test.config | 5 +--- main.nf | 21 ++++++++++++++- modules/local/buildindices.nf | 45 +++++++++++++-------------------- modules/nf-core/htslib_tabix.nf | 2 +- 4 files changed, 40 insertions(+), 33 deletions(-) diff --git a/conf/test.config b/conf/test.config index 685a29af7e..bbbf1a0f94 100644 --- a/conf/test.config +++ b/conf/test.config @@ -18,8 +18,7 @@ params { // Input data input = 'https://raw.githubusercontent.com/nf-core/test-datasets/sarek/testdata/tsv/tiny-manta-https.tsv' -/* - * TODO: uncomment when ready + // Small reference genome igenomes_ignore = true genome = 'smallGRCh37' @@ -27,8 +26,6 @@ params { snpeff_db = 'WBcel235.86' species = 'caenorhabditis_elegans' vep_cache_version = '99' -*/ - } /* diff --git a/main.nf b/main.nf index c4e9b23fb9..84849af5bb 100644 --- a/main.nf +++ b/main.nf @@ -315,9 +315,12 @@ include { MULTIQC } from './modules/nf-core/multiqc' params(params) RUN THE WORKFLOW ================================================================================ */ +include { build_indices } from './modules/local/buildindices' -workflow variant_calling{ +workflow { + + //build_indices(ch_fasta, ch_dbsnp, ch_germline_resource, ch_known_indels) FASTQC(inputSample) OUTPUT_DOCUMENTATION( @@ -334,6 +337,22 @@ workflow variant_calling{ ch_workflow_summary) } +ch_bwa = params.bwa ? Channel.value(file(params.bwa)) : build_indices.out.BWAMEM2_INDEX.out + +ch_dict = params.dict ? Channel.value(file(params.dict)) : GATK_CREATE_SEQUENCE_DICTIONARY.out + +ch_fai = params.fasta_fai ? Channel.value(file(params.fasta_fai)) : SAMTOOLS_FAIDX.out + +ch_dbsnp_tbi = params.dbsnp ? params.dbsnp_index ? 
Channel.value(file(params.dbsnp_index)) : dbsnp_tbi : "null" + +ch_germline_resource_tbi = params.germline_resource ? params.germline_resource_index ? Channel.value(file(params.germline_resource_index)) : germline_resource_tbi : "null" + +ch_known_indels_tbi = params.known_indels ? params.known_indels_index ? Channel.value(file(params.known_indels_index)) : known_indels_tbi.collect() : "null" + +ch_pon_tbi = params.pon ? params.pon_index ? Channel.value(file(params.pon_index)) : pon_tbi : "null" + +ch_intervals = params.no_intervals ? "null" : params.intervals && !('annotate' in step) ? Channel.value(file(params.intervals)) : intervalBuilt + /* ================================================================================ SEND COMPLETION EMAIL diff --git a/modules/local/buildindices.nf b/modules/local/buildindices.nf index b4dd390be2..9b4dbff2ed 100644 --- a/modules/local/buildindices.nf +++ b/modules/local/buildindices.nf @@ -7,43 +7,34 @@ // And then initialize channels based on params or indexes that were just built -include HTSLIB_TABIX as HTSLIB_TABIX_DBSNP from '../../nf-core/htslib_tabix' -include HTSLIB_TABIX as HTSLIB_TABIX_GERMLINE_RESOURCE from '../../nf-core/htslib_tabix' -include HTSLIB_TABIX as HTSLIB_TABIX_KNOWN_INDELS from '../../nf-core/htslib_tabix' -include HTSLIB_TABIX as HTSLIB_TABIX_PON from '../../nf-core/htslib_tabix' - -ch_known_indels_tbi = params.known_indels ? params.known_indels_index ? Channel.value(file(params.known_indels_index)) : known_indels_tbi.collect() : "null" - - -ch_pon_tbi = params.pon ? params.pon_index ? Channel.value(file(params.pon_index)) : pon_tbi : "null" - - - -ch_intervals = params.no_intervals ? "null" : params.intervals && !('annotate' in step) ? Channel.value(file(params.intervals)) : intervalBuilt +include { HTSLIB_TABIX as HTSLIB_TABIX_DBSNP } from '../nf-core/htslib_tabix' +include { HTSLIB_TABIX as HTSLIB_TABIX_GERMLINE_RESOURCE } from '../nf-core/htslib_tabix' +include { HTSLIB_TABIX as HTSLIB_TABIX_KNOWN_INDELS } from '../nf-core/htslib_tabix' +include { HTSLIB_TABIX as HTSLIB_TABIX_PON } from '../nf-core/htslib_tabix' workflow build_indices{ + take: + ch_fasta + ch_dbsnp + ch_germline_resource + ch_known_indels + main: BWAMEM2_INDEX(ch_fasta) - GATK_CREATE_SEQUENCE_DICTIONARY(ch_fasta) - SAMTOOLS_FAIDX(ch_fasta) - HTSLIB_TABIX_DBSNP(ch_dbsnp) //ch_dbsnp - HTSLIB_TABIX_GERMLINE_RESOURCE(ch_germline_resource) //ch_germline_resource - HTSLIB_TABIX_KNOWN_INDELS(ch_known_indels) //ch_knwon_indels -} - -ch_bwa = params.bwa ? Channel.value(file(params.bwa)) : BWAMEM2_INDEX.out - -ch_dict = params.dict ? Channel.value(file(params.dict)) : GATK_CREATE_SEQUENCE_DICTIONARY.out + emit: + bwamem2_index = BWAMEM2_INDEX.out + gatk_dict = GATK_CREATE_SEQUENCE_DICTIONARY.out + samtools_faidx = SAMTOOLS_FAIDX.out + tabix_dbsnp = HTSLIB_TABIX_DBSNP.out + tabix_germline = HTSLIB_TABIX_GERMLINE_RESOURCE.out + tabix_indels = HTSLIB_TABIX_KNOWN_INDELS.out -ch_fai = params.fasta_fai ? Channel.value(file(params.fasta_fai)) : SAMTOOLS_FAIDX.out - -ch_dbsnp_tbi = params.dbsnp ? params.dbsnp_index ? Channel.value(file(params.dbsnp_index)) : dbsnp_tbi : "null" +} -ch_germline_resource_tbi = params.germline_resource ? params.germline_resource_index ? 
Channel.value(file(params.germline_resource_index)) : germline_resource_tbi : "null" diff --git a/modules/nf-core/htslib_tabix.nf b/modules/nf-core/htslib_tabix.nf index 85dca847e2..146ccb664f 100644 --- a/modules/nf-core/htslib_tabix.nf +++ b/modules/nf-core/htslib_tabix.nf @@ -1,4 +1,4 @@ -process htslib_tabix { +process HTSLIB_TABIX { tag {vcf} container 'quay.io/biocontainers/tabix:0.2.6--ha92aebf_0' From e87c4ddaf16786732f76640296d63ba2c23a1461 Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Wed, 15 Jul 2020 11:10:18 +0200 Subject: [PATCH 012/200] bwamem2 is not working --- main.nf | 21 +++++++++++---------- modules/local/buildindices.nf | 2 +- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/main.nf b/main.nf index 22c2086cf7..40c09029f8 100644 --- a/main.nf +++ b/main.nf @@ -313,12 +313,13 @@ include { MULTIQC } from './modules/nf-core/multiqc' params(params) RUN THE WORKFLOW ================================================================================ */ -include { build_indices } from './modules/local/buildindices' +include { BUILD_INDICES } from './modules/local/buildindices' workflow { - //build_indices(ch_fasta, ch_dbsnp, ch_germline_resource, ch_known_indels) + //BUILD_INDICES(ch_fasta, ch_dbsnp, ch_germline_resource, ch_known_indels) + FASTQC(inputSample) OUTPUT_DOCUMENTATION( @@ -335,21 +336,21 @@ workflow { ch_workflow_summary) } -ch_bwa = params.bwa ? Channel.value(file(params.bwa)) : build_indices.out.BWAMEM2_INDEX.out +// ch_bwa = params.bwa ? Channel.value(file(params.bwa)) : build_indices.out.BWAMEM2_INDEX.out -ch_dict = params.dict ? Channel.value(file(params.dict)) : GATK_CREATE_SEQUENCE_DICTIONARY.out +// ch_dict = params.dict ? Channel.value(file(params.dict)) : GATK_CREATE_SEQUENCE_DICTIONARY.out -ch_fai = params.fasta_fai ? Channel.value(file(params.fasta_fai)) : SAMTOOLS_FAIDX.out +// ch_fai = params.fasta_fai ? Channel.value(file(params.fasta_fai)) : SAMTOOLS_FAIDX.out -ch_dbsnp_tbi = params.dbsnp ? params.dbsnp_index ? Channel.value(file(params.dbsnp_index)) : dbsnp_tbi : "null" +// ch_dbsnp_tbi = params.dbsnp ? params.dbsnp_index ? Channel.value(file(params.dbsnp_index)) : dbsnp_tbi : "null" -ch_germline_resource_tbi = params.germline_resource ? params.germline_resource_index ? Channel.value(file(params.germline_resource_index)) : germline_resource_tbi : "null" +// ch_germline_resource_tbi = params.germline_resource ? params.germline_resource_index ? Channel.value(file(params.germline_resource_index)) : germline_resource_tbi : "null" -ch_known_indels_tbi = params.known_indels ? params.known_indels_index ? Channel.value(file(params.known_indels_index)) : known_indels_tbi.collect() : "null" +// ch_known_indels_tbi = params.known_indels ? params.known_indels_index ? Channel.value(file(params.known_indels_index)) : known_indels_tbi.collect() : "null" -ch_pon_tbi = params.pon ? params.pon_index ? Channel.value(file(params.pon_index)) : pon_tbi : "null" +// ch_pon_tbi = params.pon ? params.pon_index ? Channel.value(file(params.pon_index)) : pon_tbi : "null" -ch_intervals = params.no_intervals ? "null" : params.intervals && !('annotate' in step) ? Channel.value(file(params.intervals)) : intervalBuilt +// ch_intervals = params.no_intervals ? "null" : params.intervals && !('annotate' in step) ? 
Channel.value(file(params.intervals)) : intervalBuilt /* ================================================================================ diff --git a/modules/local/buildindices.nf b/modules/local/buildindices.nf index 9b4dbff2ed..f34c0a9fc8 100644 --- a/modules/local/buildindices.nf +++ b/modules/local/buildindices.nf @@ -12,7 +12,7 @@ include { HTSLIB_TABIX as HTSLIB_TABIX_GERMLINE_RESOURCE } from '../nf-core/htsl include { HTSLIB_TABIX as HTSLIB_TABIX_KNOWN_INDELS } from '../nf-core/htslib_tabix' include { HTSLIB_TABIX as HTSLIB_TABIX_PON } from '../nf-core/htslib_tabix' -workflow build_indices{ +workflow BUILD_INDICES{ take: ch_fasta ch_dbsnp From 97e07befdfa084dc5fdafdf5b4f0f9c5a3f9c48c Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Wed, 15 Jul 2020 14:47:09 +0200 Subject: [PATCH 013/200] Add index subworkflow with bwamem2, samtools, gatk, htslib --- main.nf | 18 ++++++++++-------- modules/local/buildindices.nf | 28 ++++++++++++++++++++++------ modules/local/gatk_dict.nf | 2 -- modules/nf-core/bwamem2_index.nf | 2 -- 4 files changed, 32 insertions(+), 18 deletions(-) diff --git a/main.nf b/main.nf index 40c09029f8..a87f71001f 100644 --- a/main.nf +++ b/main.nf @@ -179,8 +179,7 @@ if (tsvPath) { */ // Initialize each params in params.genomes, catch the command line first if it was defined -// params.fasta has to be the first one -params.fasta = params.genome && !('annotate' in step) ? params.genomes[params.genome].fasta ?: null : null + // The rest can be sorted params.ac_loci = params.genome && 'ascat' in tools ? params.genomes[params.genome].ac_loci ?: null : null params.ac_loci_gc = params.genome && 'ascat' in tools ? params.genomes[params.genome].ac_loci_gc ?: null : null @@ -206,12 +205,8 @@ ch_ac_loci = params.ac_loci && 'ascat' in tools ? Channel.value(file(params.ac_l ch_ac_loci_gc = params.ac_loci_gc && 'ascat' in tools ? Channel.value(file(params.ac_loci_gc)) : "null" ch_chr_dir = params.chr_dir && 'controlfreec' in tools ? Channel.value(file(params.chr_dir)) : "null" ch_chr_length = params.chr_length && 'controlfreec' in tools ? Channel.value(file(params.chr_length)) : "null" -ch_dbsnp = params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || params.sentieon) ? Channel.value(file(params.dbsnp)) : "null" -ch_fasta = params.fasta && !('annotate' in step) ? Channel.value(file(params.fasta)) : "null" ch_fai = params.fasta_fai && !('annotate' in step) ? Channel.value(file(params.fasta_fai)) : "null" -ch_germline_resource = params.germline_resource && 'mutect2' in tools ? Channel.value(file(params.germline_resource)) : "null" ch_intervals = params.intervals && !params.no_intervals && !('annotate' in step) ? Channel.value(file(params.intervals)) : "null" -ch_known_indels = params.known_indels && ('mapping' in step || 'preparerecalibration' in step) ? Channel.value(file(params.known_indels)) : "null" ch_mappability = params.mappability && 'controlfreec' in tools ? Channel.value(file(params.mappability)) : "null" ch_snpeff_cache = params.snpeff_cache ? 
Channel.value(file(params.snpeff_cache)) : "null" @@ -313,12 +308,19 @@ include { MULTIQC } from './modules/nf-core/multiqc' params(params) RUN THE WORKFLOW ================================================================================ */ -include { BUILD_INDICES } from './modules/local/buildindices' +include { BUILD_INDICES } from './modules/local/buildindices' addParams(params) + +// params.fasta has to be the first one +params.fasta = params.genome && !('annotate' in step) ? params.genomes[params.genome].fasta ?: null : null +ch_fasta = params.fasta && !('annotate' in step) ? Channel.value(file(params.fasta)) : "null" +ch_dbsnp = params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || params.sentieon) ? Channel.value(file(params.dbsnp)) : "null" +ch_germline_resource = params.germline_resource && 'mutect2' in tools ? Channel.value(file(params.germline_resource)) : "null" +ch_known_indels = params.known_indels && ('mapping' in step || 'preparerecalibration' in step) ? Channel.value(file(params.known_indels)) : "null" workflow { - //BUILD_INDICES(ch_fasta, ch_dbsnp, ch_germline_resource, ch_known_indels) + BUILD_INDICES(step, ch_fasta, ch_dbsnp, ch_germline_resource, ch_known_indels) FASTQC(inputSample) diff --git a/modules/local/buildindices.nf b/modules/local/buildindices.nf index f34c0a9fc8..909dcd48ba 100644 --- a/modules/local/buildindices.nf +++ b/modules/local/buildindices.nf @@ -11,21 +11,37 @@ include { HTSLIB_TABIX as HTSLIB_TABIX_DBSNP } from '../nf-core/htslib_tabix' include { HTSLIB_TABIX as HTSLIB_TABIX_GERMLINE_RESOURCE } from '../nf-core/htslib_tabix' include { HTSLIB_TABIX as HTSLIB_TABIX_KNOWN_INDELS } from '../nf-core/htslib_tabix' include { HTSLIB_TABIX as HTSLIB_TABIX_PON } from '../nf-core/htslib_tabix' +include { BWAMEM2_INDEX as BWAMEM2_INDEX } from '../nf-core/bwamem2_index.nf' +include { SAMTOOLS_FAIDX as SAMTOOLS_FAIDX } from '../nf-core/samtools_faidx.nf' +include { GATK_CREATE_SEQUENCE_DICTIONARY as GATK_CREATE_SEQUENCE_DICTIONARY } from './gatk_dict.nf' workflow BUILD_INDICES{ take: + step ch_fasta ch_dbsnp ch_germline_resource ch_known_indels main: - BWAMEM2_INDEX(ch_fasta) - GATK_CREATE_SEQUENCE_DICTIONARY(ch_fasta) - SAMTOOLS_FAIDX(ch_fasta) - HTSLIB_TABIX_DBSNP(ch_dbsnp) //ch_dbsnp - HTSLIB_TABIX_GERMLINE_RESOURCE(ch_germline_resource) //ch_germline_resource - HTSLIB_TABIX_KNOWN_INDELS(ch_known_indels) //ch_knwon_indels + + if(!(params.bwa) && params.fasta && 'mapping' in step) + BWAMEM2_INDEX(ch_fasta) + + if(!(params.dict) && params.fasta && !('annotate' in step) && !('controlfreec' in step)) + ATK_CREATE_SEQUENCE_DICTIONARY(ch_fasta) + + if(!(params.fasta_fai) && params.fasta && !('annotate' in step)) + SAMTOOLS_FAIDX(ch_fasta) + + if(!(params.dbsnp_index) && params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || 'tnscope' in tools)) + HTSLIB_TABIX_DBSNP(ch_dbsnp) + + if(!(params.germline_resource_index) && params.germline_resource && 'mutect2' in tools) + HTSLIB_TABIX_GERMLINE_RESOURCE(ch_germline_resource) + + if(!(params.known_indels_index) && params.known_indels && ('mapping' in step || 'preparerecalibration' in step)) + HTSLIB_TABIX_KNOWN_INDELS(ch_known_indels) emit: bwamem2_index = BWAMEM2_INDEX.out diff --git a/modules/local/gatk_dict.nf b/modules/local/gatk_dict.nf index 381ee4dcae..d255da7438 100644 --- a/modules/local/gatk_dict.nf +++ 
b/modules/local/gatk_dict.nf @@ -10,8 +10,6 @@ process GATK_CREATE_SEQUENCE_DICTIONARY { output: path file("${fasta.baseName}.dict") - //when: !(params.dict) && params.fasta && !('annotate' in step) && !('controlfreec' in step) - script: """ gatk --java-options "-Xmx${task.memory.toGiga()}g" \ diff --git a/modules/nf-core/bwamem2_index.nf b/modules/nf-core/bwamem2_index.nf index d039043890..ccd628282c 100644 --- a/modules/nf-core/bwamem2_index.nf +++ b/modules/nf-core/bwamem2_index.nf @@ -10,8 +10,6 @@ process BWAMEM2_INDEX { output: path file("${fasta}.*") - //when: !(params.bwa) && params.fasta && 'mapping' in step - script: """ bwa-mem2 index ${fasta} From da2b677c806d9efb4b0e2017d440a43661b81cee Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Wed, 15 Jul 2020 15:11:23 +0200 Subject: [PATCH 014/200] Add intervals building --- modules/local/buildindices.nf | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/modules/local/buildindices.nf b/modules/local/buildindices.nf index 909dcd48ba..7c423f295c 100644 --- a/modules/local/buildindices.nf +++ b/modules/local/buildindices.nf @@ -14,7 +14,7 @@ include { HTSLIB_TABIX as HTSLIB_TABIX_PON } from '../nf-core/htslib_tabix' include { BWAMEM2_INDEX as BWAMEM2_INDEX } from '../nf-core/bwamem2_index.nf' include { SAMTOOLS_FAIDX as SAMTOOLS_FAIDX } from '../nf-core/samtools_faidx.nf' include { GATK_CREATE_SEQUENCE_DICTIONARY as GATK_CREATE_SEQUENCE_DICTIONARY } from './gatk_dict.nf' - +include { BUILD_INTERVALS } from './build_intervals.nf' workflow BUILD_INDICES{ take: step @@ -22,6 +22,7 @@ workflow BUILD_INDICES{ ch_dbsnp ch_germline_resource ch_known_indels + ch_pon main: @@ -29,7 +30,7 @@ workflow BUILD_INDICES{ BWAMEM2_INDEX(ch_fasta) if(!(params.dict) && params.fasta && !('annotate' in step) && !('controlfreec' in step)) - ATK_CREATE_SEQUENCE_DICTIONARY(ch_fasta) + GATK_CREATE_SEQUENCE_DICTIONARY(ch_fasta) if(!(params.fasta_fai) && params.fasta && !('annotate' in step)) SAMTOOLS_FAIDX(ch_fasta) @@ -43,13 +44,24 @@ workflow BUILD_INDICES{ if(!(params.known_indels_index) && params.known_indels && ('mapping' in step || 'preparerecalibration' in step)) HTSLIB_TABIX_KNOWN_INDELS(ch_known_indels) + if(!(params.pon_index) && params.pon && ('tnscope' in tools || 'mutect2' in tools)) + HTSLIB_TABIX_PON(ch_pon) + + if(!(params.intervals) && !('annotate' in step) && !('controlfreec' in step)){ + ch_fai = params.fasta_fai ? 
Channel.value(file(params.fasta_fai)) : SAMTOOLS_FAIDX.out + BUILD_INTERVALS(ch_fai) + } + + emit: - bwamem2_index = BWAMEM2_INDEX.out - gatk_dict = GATK_CREATE_SEQUENCE_DICTIONARY.out - samtools_faidx = SAMTOOLS_FAIDX.out - tabix_dbsnp = HTSLIB_TABIX_DBSNP.out - tabix_germline = HTSLIB_TABIX_GERMLINE_RESOURCE.out - tabix_indels = HTSLIB_TABIX_KNOWN_INDELS.out + bwa_built = BWAMEM2_INDEX.out + dictBuilt = GATK_CREATE_SEQUENCE_DICTIONARY.out + fai_built = SAMTOOLS_FAIDX.out + dbsnp_tbi = HTSLIB_TABIX_DBSNP.out + germline_resource_tbi = HTSLIB_TABIX_GERMLINE_RESOURCE.out + known_indels_tbi = HTSLIB_TABIX_KNOWN_INDELS.out + pon_tbi = HTSLIB_TABIX_PON.out + intervalBuilt = BUILD_INTERVALS.out } From a87e74c5db2e395df269a881ca5a8a6d10c071a1 Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Wed, 15 Jul 2020 15:28:03 +0200 Subject: [PATCH 015/200] Fix number of input channels for subworkflow build_indices --- main.nf | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index a87f71001f..a12f6e800e 100644 --- a/main.nf +++ b/main.nf @@ -219,7 +219,6 @@ ch_cadd_indels = params.cadd_indels ? Channel.value(file(params.cadd_indels)) : ch_cadd_indels_tbi = params.cadd_indels_tbi ? Channel.value(file(params.cadd_indels_tbi)) : "null" ch_cadd_wg_snvs = params.cadd_wg_snvs ? Channel.value(file(params.cadd_wg_snvs)) : "null" ch_cadd_wg_snvs_tbi = params.cadd_wg_snvs_tbi ? Channel.value(file(params.cadd_wg_snvs_tbi)) : "null" -ch_pon = params.pon ? Channel.value(file(params.pon)) : "null" ch_target_bed = params.target_bed ? Channel.value(file(params.target_bed)) : "null" /* @@ -316,11 +315,17 @@ ch_fasta = params.fasta && !('annotate' in step) ? Channel.value(file(params.fas ch_dbsnp = params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || params.sentieon) ? Channel.value(file(params.dbsnp)) : "null" ch_germline_resource = params.germline_resource && 'mutect2' in tools ? Channel.value(file(params.germline_resource)) : "null" ch_known_indels = params.known_indels && ('mapping' in step || 'preparerecalibration' in step) ? Channel.value(file(params.known_indels)) : "null" +ch_pon = params.pon ? Channel.value(file(params.pon)) : "null" workflow { - BUILD_INDICES(step, ch_fasta, ch_dbsnp, ch_germline_resource, ch_known_indels) + BUILD_INDICES( step, + ch_fasta, + ch_dbsnp, + ch_germline_resource, + ch_known_indels, + ch_pon) FASTQC(inputSample) From 2f6e6eaa13dc10ff66d8304d6b793c0d1341f04a Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Wed, 15 Jul 2020 15:36:07 +0200 Subject: [PATCH 016/200] Move build indices to subworkflow folder --- main.nf | 2 +- .../{local/buildindices.nf => subworkflows/build_indices.nf} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename modules/{local/buildindices.nf => subworkflows/build_indices.nf} (100%) diff --git a/main.nf b/main.nf index a12f6e800e..c779f2aea6 100644 --- a/main.nf +++ b/main.nf @@ -307,7 +307,7 @@ include { MULTIQC } from './modules/nf-core/multiqc' params(params) RUN THE WORKFLOW ================================================================================ */ -include { BUILD_INDICES } from './modules/local/buildindices' addParams(params) +include { BUILD_INDICES } from './modules/subworkflows/build_indices' addParams(params) // params.fasta has to be the first one params.fasta = params.genome && !('annotate' in step) ?
params.genomes[params.genome].fasta ?: null : null diff --git a/modules/local/buildindices.nf b/modules/subworkflows/build_indices.nf similarity index 100% rename from modules/local/buildindices.nf rename to modules/subworkflows/build_indices.nf From 5c733b0fed01a0309c7adfe152bcd42793b1bdc0 Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Wed, 15 Jul 2020 15:42:01 +0200 Subject: [PATCH 017/200] Fix relative paths for include --- modules/subworkflows/build_indices.nf | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/subworkflows/build_indices.nf b/modules/subworkflows/build_indices.nf index 7c423f295c..552772fbfa 100644 --- a/modules/subworkflows/build_indices.nf +++ b/modules/subworkflows/build_indices.nf @@ -13,8 +13,10 @@ include { HTSLIB_TABIX as HTSLIB_TABIX_KNOWN_INDELS } from '../nf-core/htslib_ta include { HTSLIB_TABIX as HTSLIB_TABIX_PON } from '../nf-core/htslib_tabix' include { BWAMEM2_INDEX as BWAMEM2_INDEX } from '../nf-core/bwamem2_index.nf' include { SAMTOOLS_FAIDX as SAMTOOLS_FAIDX } from '../nf-core/samtools_faidx.nf' -include { GATK_CREATE_SEQUENCE_DICTIONARY as GATK_CREATE_SEQUENCE_DICTIONARY } from './gatk_dict.nf' -include { BUILD_INTERVALS } from './build_intervals.nf' +include { GATK_CREATE_SEQUENCE_DICTIONARY as GATK_CREATE_SEQUENCE_DICTIONARY } from '../local/gatk_dict.nf' +include { BUILD_INTERVALS } from '../local/build_intervals.nf' + + workflow BUILD_INDICES{ take: step From 573761d7bc92168e09acca9270a958b671db0869 Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Wed, 15 Jul 2020 15:51:08 +0200 Subject: [PATCH 018/200] Add PR to changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2fe9f494a2..7ce1a5e809 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
+## [dsl2](https://github.com/nf-core/sarek/tree/dev) + +- [#238](https://github.com/nf-core/sarek/pull/238) -Add subworkflow for building all the indices + ## [dev](https://github.com/nf-core/sarek/tree/dev) - [#234](https://github.com/nf-core/sarek/pull/234) -Switching to DSL2 From b06abe273ebd651b53713dd9285f1fdf41ea0e2a Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 15 Jul 2020 16:24:27 +0200 Subject: [PATCH 019/200] add Sarek ascii art to header --- lib/Headers.groovy | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/lib/Headers.groovy b/lib/Headers.groovy index 8fd9c8c6fc..1d9d02eeb5 100644 --- a/lib/Headers.groovy +++ b/lib/Headers.groovy @@ -24,14 +24,21 @@ class Headers { Map colors = log_colours(monochrome_logs) String.format( """\n - -${colors.dim}----------------------------------------------------${colors.reset}- - ${colors.green},--.${colors.black}/${colors.green},-.${colors.reset} - ${colors.blue} ___ __ __ __ ___ ${colors.green}/,-._.--~\'${colors.reset} - ${colors.blue} |\\ | |__ __ / ` / \\ |__) |__ ${colors.yellow}} {${colors.reset} - ${colors.blue} | \\| | \\__, \\__/ | \\ |___ ${colors.green}\\`-._,-`-,${colors.reset} - ${colors.green}`._,._,\'${colors.reset} - ${colors.purple} ${workflow.manifest.name} v${workflow.manifest.version}${colors.reset} - -${colors.dim}----------------------------------------------------${colors.reset}- + -${colors.dim}----------------------------------------------------${colors.reset}- + ${colors.green},--.${colors.black}/${colors.green},-.${colors.reset} + ${colors.blue} ___ __ __ __ ___ ${colors.green}/,-._.--~\'${colors.reset} + ${colors.blue} |\\ | |__ __ / ` / \\ |__) |__ ${colors.yellow}} {${colors.reset} + ${colors.blue} | \\| | \\__, \\__/ | \\ |___ ${colors.green}\\`-._,-`-,${colors.reset} + ${colors.green}`._,._,\'${colors.reset} + ${colors.white}____${colors.reset} + ${colors.white}.´ _ `.${colors.reset} + ${colors.white}/ ${colors.green}|\\${colors.white}`-_ \\${colors.reset} ${colors.blue} __ __ ___ ${colors.reset} + ${colors.white}| ${colors.green}| \\${colors.white} `-|${colors.reset} ${colors.blue}|__` /\\ |__) |__ |__/${colors.reset} + ${colors.white}\\ ${colors.green}| \\${colors.white} /${colors.reset} ${colors.blue}.__| /¯¯\\ | \\ |___ | \\${colors.reset} + ${colors.white}`${colors.green}|${colors.white}____${colors.green}\\${colors.white}´${colors.reset} + + ${colors.purple} ${workflow.manifest.name} v${workflow.manifest.version}${colors.reset} +-${colors.dim}--------------------------------------------------${colors.reset}- """.stripIndent() ) } From 6367547fe4a36d07e3a58a2eb771c832b0065c3d Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Wed, 15 Jul 2020 16:34:50 +0200 Subject: [PATCH 020/200] Fix bwamem2 and other modules --- main.nf | 52 +++++++++++++-------------- modules/local/gatk_dict.nf | 4 +-- modules/nf-core/bwamem2_index.nf | 4 +-- modules/nf-core/samtools_faidx.nf | 4 +-- modules/subworkflows/build_indices.nf | 18 +++++----- 5 files changed, 40 insertions(+), 42 deletions(-) diff --git a/main.nf b/main.nf index c779f2aea6..f9adcc5dbb 100644 --- a/main.nf +++ b/main.nf @@ -43,7 +43,8 @@ include { hasExtension; defineToolList; checkParameterList; extractBam; - extractFastqFromDir } from './modules/local/functions' + extractFastqFromDir; + checkParameterExistence } from './modules/local/functions' /* ================================================================================ @@ -76,11 +77,11 @@ ch_output_docs_images = file("$baseDir/docs/images/", 
checkIfExists: true) stepList = defineStepList() step = params.step ? params.step.toLowerCase().replaceAll('-', '').replaceAll('_', '') : '' -// // Handle deprecation -// if (step == 'preprocessing') step = 'mapping' +// Handle deprecation +if (step == 'preprocessing') step = 'mapping' -// if (step.contains(',')) exit 1, 'You can choose only one step, see --help for more information' -// if (!checkParameterExistence(step, stepList)) exit 1, "Unknown step ${step}, see --help for more information" +if (step.contains(',')) exit 1, 'You can choose only one step, see --help for more information' +if (!checkParameterExistence(step, stepList)) exit 1, "Unknown step ${step}, see --help for more information" toolList = defineToolList() tools = params.tools ? params.tools.split(',').collect{it.trim().toLowerCase().replaceAll('-', '').replaceAll('_', '')} : [] @@ -179,6 +180,9 @@ if (tsvPath) { */ // Initialize each params in params.genomes, catch the command line first if it was defined +// params.fasta has to be the first one +params.fasta = params.genome && !('annotate' in step) ? params.genomes[params.genome].fasta ?: null : null + // The rest can be sorted params.ac_loci = params.genome && 'ascat' in tools ? params.genomes[params.genome].ac_loci ?: null : null @@ -201,6 +205,12 @@ params.species = params.genome && 'vep' in tools ? params.genomes[params.genome] params.vep_cache_version = params.genome && 'vep' in tools ? params.genomes[params.genome].vep_cache_version ?: null : null // Initialize channels based on params +ch_fasta = params.fasta && !('annotate' in step) ? Channel.value(file(params.fasta)) : "null" +ch_dbsnp = params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || params.sentieon) ? Channel.value(file(params.dbsnp)) : "null" +ch_germline_resource = params.germline_resource && 'mutect2' in tools ? Channel.value(file(params.germline_resource)) : "null" +ch_known_indels = params.known_indels && ('mapping' in step || 'preparerecalibration' in step) ? Channel.value(file(params.known_indels)) : "null" +ch_pon = params.pon ? Channel.value(file(params.pon)) : "null" + ch_ac_loci = params.ac_loci && 'ascat' in tools ? Channel.value(file(params.ac_loci)) : "null" ch_ac_loci_gc = params.ac_loci_gc && 'ascat' in tools ? Channel.value(file(params.ac_loci_gc)) : "null" ch_chr_dir = params.chr_dir && 'controlfreec' in tools ? Channel.value(file(params.chr_dir)) : "null" @@ -309,14 +319,16 @@ include { MULTIQC } from './modules/nf-core/multiqc' params(params) */ include { BUILD_INDICES } from './modules/subworkflows/build_indices' addParams(params) -// params.fasta has to be the first one -params.fasta = params.genome && !('annotate' in step) ? params.genomes[params.genome].fasta ?: null : null -ch_fasta = params.fasta && !('annotate' in step) ? Channel.value(file(params.fasta)) : "null" -ch_dbsnp = params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || params.sentieon) ? Channel.value(file(params.dbsnp)) : "null" -ch_germline_resource = params.germline_resource && 'mutect2' in tools ? Channel.value(file(params.germline_resource)) : "null" -ch_known_indels = params.known_indels && ('mapping' in step || 'preparerecalibration' in step) ? Channel.value(file(params.known_indels)) : "null" -ch_pon = params.pon ? Channel.value(file(params.pon)) : "null" +ch_fasta.dump(tag: 'ch_fasta') +//ch_bwa = params.bwa ? 
Channel.value(file(params.bwa)) : BUILD_INDICES.out.bwa_built +//ch_dict = params.dict ? Channel.value(file(params.dict)) : GATK_CREATE_SEQUENCE_DICTIONARY.out +//ch_fai = params.fasta_fai ? Channel.value(file(params.fasta_fai)) : SAMTOOLS_FAIDX.out +//ch_dbsnp_tbi = params.dbsnp ? params.dbsnp_index ? Channel.value(file(params.dbsnp_index)) : dbsnp_tbi : "null" +//ch_germline_resource_tbi = params.germline_resource ? params.germline_resource_index ? Channel.value(file(params.germline_resource_index)) : germline_resource_tbi : "null" +//ch_known_indels_tbi = params.known_indels ? params.known_indels_index ? Channel.value(file(params.known_indels_index)) : known_indels_tbi.collect() : "null" +//ch_pon_tbi = params.pon ? params.pon_index ? Channel.value(file(params.pon_index)) : pon_tbi : "null" +//ch_intervals = params.no_intervals ? "null" : params.intervals && !('annotate' in step) ? Channel.value(file(params.intervals)) : intervalBuilt workflow { @@ -327,6 +339,7 @@ workflow { ch_known_indels, ch_pon) + FASTQC(inputSample) OUTPUT_DOCUMENTATION( @@ -343,21 +356,6 @@ workflow { ch_workflow_summary) } -// ch_bwa = params.bwa ? Channel.value(file(params.bwa)) : build_indices.out.BWAMEM2_INDEX.out - -// ch_dict = params.dict ? Channel.value(file(params.dict)) : GATK_CREATE_SEQUENCE_DICTIONARY.out - -// ch_fai = params.fasta_fai ? Channel.value(file(params.fasta_fai)) : SAMTOOLS_FAIDX.out - -// ch_dbsnp_tbi = params.dbsnp ? params.dbsnp_index ? Channel.value(file(params.dbsnp_index)) : dbsnp_tbi : "null" - -// ch_germline_resource_tbi = params.germline_resource ? params.germline_resource_index ? Channel.value(file(params.germline_resource_index)) : germline_resource_tbi : "null" - -// ch_known_indels_tbi = params.known_indels ? params.known_indels_index ? Channel.value(file(params.known_indels_index)) : known_indels_tbi.collect() : "null" - -// ch_pon_tbi = params.pon ? params.pon_index ? Channel.value(file(params.pon_index)) : pon_tbi : "null" - -// ch_intervals = params.no_intervals ? "null" : params.intervals && !('annotate' in step) ? Channel.value(file(params.intervals)) : intervalBuilt /* ================================================================================ diff --git a/modules/local/gatk_dict.nf b/modules/local/gatk_dict.nf index d255da7438..70750d4990 100644 --- a/modules/local/gatk_dict.nf +++ b/modules/local/gatk_dict.nf @@ -5,10 +5,10 @@ process GATK_CREATE_SEQUENCE_DICTIONARY { saveAs: {params.save_reference ? "reference_genome/${it}" : null } input: - path file(fasta) + path fasta output: - path file("${fasta.baseName}.dict") + path ("${fasta.baseName}.dict") script: """ diff --git a/modules/nf-core/bwamem2_index.nf b/modules/nf-core/bwamem2_index.nf index ccd628282c..0206509753 100644 --- a/modules/nf-core/bwamem2_index.nf +++ b/modules/nf-core/bwamem2_index.nf @@ -5,10 +5,10 @@ process BWAMEM2_INDEX { saveAs: {params.save_reference ? "reference_genome/BWAIndex/${it}" : null } input: - path file(fasta) + path fasta output: - path file("${fasta}.*") + path("${fasta}.*") script: """ diff --git a/modules/nf-core/samtools_faidx.nf b/modules/nf-core/samtools_faidx.nf index 72d6d6f7c9..fcb2e5cb61 100644 --- a/modules/nf-core/samtools_faidx.nf +++ b/modules/nf-core/samtools_faidx.nf @@ -5,10 +5,10 @@ process SAMTOOLS_FAIDX { saveAs: {params.save_reference ? 
"reference_genome/${it}" : null } input: - path file(fasta) + path fasta output: - path file("${fasta}.fai") + path ("${fasta}.fai") //when: !(params.fasta_fai) && params.fasta && !('annotate' in step) diff --git a/modules/subworkflows/build_indices.nf b/modules/subworkflows/build_indices.nf index 552772fbfa..71d8a417c0 100644 --- a/modules/subworkflows/build_indices.nf +++ b/modules/subworkflows/build_indices.nf @@ -28,7 +28,7 @@ workflow BUILD_INDICES{ main: - if(!(params.bwa) && params.fasta && 'mapping' in step) + if(! (params.bwa) && params.fasta && 'mapping' in step) BWAMEM2_INDEX(ch_fasta) if(!(params.dict) && params.fasta && !('annotate' in step) && !('controlfreec' in step)) @@ -56,14 +56,14 @@ workflow BUILD_INDICES{ emit: - bwa_built = BWAMEM2_INDEX.out - dictBuilt = GATK_CREATE_SEQUENCE_DICTIONARY.out - fai_built = SAMTOOLS_FAIDX.out - dbsnp_tbi = HTSLIB_TABIX_DBSNP.out - germline_resource_tbi = HTSLIB_TABIX_GERMLINE_RESOURCE.out - known_indels_tbi = HTSLIB_TABIX_KNOWN_INDELS.out - pon_tbi = HTSLIB_TABIX_PON.out - intervalBuilt = BUILD_INTERVALS.out + bwa_built = BWAMEM2_INDEX.out + dictBuilt = GATK_CREATE_SEQUENCE_DICTIONARY.out + fai_built = SAMTOOLS_FAIDX.out + dbsnp_tbi = HTSLIB_TABIX_DBSNP.out + germline_resource_tbi = HTSLIB_TABIX_GERMLINE_RESOURCE.out + known_indels_tbi = HTSLIB_TABIX_KNOWN_INDELS.out + pon_tbi = HTSLIB_TABIX_PON.out + intervalBuilt = BUILD_INTERVALS.out } From b4f39de725e2d9b3276fafb800a013aff2f7543a Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Wed, 15 Jul 2020 18:09:32 +0200 Subject: [PATCH 021/200] Add preprocessing to workflow Co-authored-by: Gisela Gabernet Co-authored-by: Maxime Garcia --- assets/multiqc_config.yaml | 1 + bin/scrape_software_versions.py | 4 +- conf/base.config | 32 +-- lib/Schema.groovy | 6 +- main.nf | 363 ++++--------------------- modules/local/create_intervals_bed.nf | 50 ++++ modules/local/get_software_versions.nf | 6 +- modules/local/trim_galore.nf | 53 ++++ modules/nf-core/bwamem2_mem.nf | 24 ++ modules/nf-core/fastqc.nf | 5 - modules/nf-core/htslib_tabix.nf | 2 +- modules/nf-core/multiqc.nf | 5 +- modules/subworkflows/build_indices.nf | 33 +-- 13 files changed, 228 insertions(+), 356 deletions(-) create mode 100644 modules/local/create_intervals_bed.nf create mode 100644 modules/local/trim_galore.nf create mode 100644 modules/nf-core/bwamem2_mem.nf diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index 2865e59a99..49b2e00887 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -19,6 +19,7 @@ top_modules: name: 'FastQC' path_filters_exclude: - '*trimmed_fastqc*' +- 'cutadapt' - 'fastqc': name: 'FastQC after trimming' info: 'FastQC after applying TrimGalore.' 
diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py index 3c05ee027d..2d878f7f72 100755 --- a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -7,7 +7,7 @@ 'AlleleCount': ['v_allelecount.txt', r"(\S+)"], 'ASCAT': ['v_ascat.txt', r"Version: (\S+)"], 'bcftools': ['v_bcftools.txt', r"bcftools (\S+)"], - 'BWA': ['v_bwa.txt', r"Version: (\S+)"], + 'BWAMEM2': ['v_bwamem2.txt', r"Version: (\S+)"], 'CNVkit': ['v_cnvkit.txt', r"(\S+)"], 'Control-FREEC': ['v_controlfreec.txt', r"Control-FREEC\s(\S+)"], 'FastQC': ['v_fastqc.txt', r"FastQC v(\S+)"], @@ -35,7 +35,7 @@ results['ASCAT'] = 'N/A' results['AlleleCount'] = 'N/A' results['bcftools'] = 'N/A' -results['BWA'] = 'N/A' +results['BWA-MEM2'] = 'N/A' results['CNVkit'] = 'N/A' results['Control-FREEC'] = 'N/A' results['FastQC'] = 'N/A' diff --git a/conf/base.config b/conf/base.config index e88657d6cd..79e339bd98 100644 --- a/conf/base.config +++ b/conf/base.config @@ -19,55 +19,55 @@ process { maxErrors = '-1' maxRetries = 3 - withLabel:cpus_1 { + withLabel:CPUS_1 { cpus = {check_resource(1)} } - withLabel:cpus_2 { + withLabel:CPUS_2 { cpus = {check_resource(2)} } - withLabel:cpus_4 { + withLabel:CPUS_4 { cpus = {check_resource(4)} } - withLabel:cpus_8 { + withLabel:CPUS_8 { cpus = {check_resource(8)} } - withLabel:cpus_16 { + withLabel:CPUS_16 { cpus = {check_resource(16)} } - withLabel:cpus_max { + withLabel:CPUS_MAX { cpus = {params.max_cpus} } - withLabel:memory_singleCPU_2_task { + withLabel:MEMORY_SINGLECPU_2_TASK { memory = {check_resource((params.single_cpu_mem as nextflow.util.MemoryUnit) * 2 * task.attempt)} } - withLabel:memory_singleCPU_task_sq { + withLabel:MEMORY_SINGLECPU_TASK_SQ { memory = {check_resource((params.single_cpu_mem as nextflow.util.MemoryUnit) * task.attempt * task.attempt)} } - withLabel:memory_max { + withLabel:MEMORY_MAX { memory = {params.max_memory} } - withName:Get_software_versions { + withName:GET_SOFTWARE_VERSIONS { cache = false } - withName:ConcatVCF { - // For unknown reasons, ConcatVCF sometimes fails with SIGPIPE + withName:CONCATVCF { + // For unknown reasons, CONCATVCF sometimes fails with SIGPIPE // (exit code 141). Rerunning the process will usually work. errorStrategy = {task.exitStatus == 141 ? 'retry' : 'terminate'} } - withLabel:FastQC { + withLabel:FASTQC { errorStrategy = {task.exitStatus == 143 ? 'retry' : 'ignore'} } - withName:MapReads { + withName:BWAMEM2_MEM { memory = {check_resource(60.GB * task.attempt)} time = {check_resource(48.h * task.attempt)} } - withName:MultiQC { + withName:MULTIQC { errorStrategy = {task.exitStatus == 143 ? 'retry' : 'ignore'} } - withName:Snpeff { + withName:SNPEFF { container = {(params.annotation_cache && params.snpeff_cache) ? 'nfcore/sarek:dev' : "nfcore/sareksnpeff:dev.${params.genome}"} errorStrategy = {task.exitStatus == 143 ? 'retry' : 'ignore'} } diff --git a/lib/Schema.groovy b/lib/Schema.groovy index f0a6ad148d..431eeed6d0 100644 --- a/lib/Schema.groovy +++ b/lib/Schema.groovy @@ -112,10 +112,10 @@ class JSON { static String params_mqc_summary(summary) { String yaml_file_text = """ - id: 'nf-core-tcrseq-summary' + id: 'nf-core-sarek-summary' description: " - this information is collected when the pipeline is started." - section_name: 'nf-core/tcrseq Workflow Summary' - section_href: 'https://github.com/nf-core/tcrseq' + section_name: 'nf-core/sarek Workflow Summary' + section_href: 'https://github.com/nf-core/sarek' plot_type: 'html' data: |
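The conf/base.config changes above lean on Nextflow's process selectors: withLabel applies a configuration block to every process that declares the matching label, while withName targets a single process by name. Selector matching is case-sensitive, so renaming the selectors to upper case only takes effect once the label strings declared in the modules are renamed to match. A minimal sketch of the mechanism (the resource values below are illustrative, not Sarek's actual defaults):

process {
    // matches every process containing the declaration: label 'CPUS_8'
    withLabel:CPUS_8 {
        cpus = 8
    }

    // matches only the process named BWAMEM2_MEM
    withName:BWAMEM2_MEM {
        memory = 60.GB
        time   = 48.h
    }
}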
diff --git a/main.nf b/main.nf index f9adcc5dbb..0c6759fd90 100644 --- a/main.nf +++ b/main.nf @@ -318,29 +318,68 @@ include { MULTIQC } from './modules/nf-core/multiqc' params(params) ================================================================================ */ include { BUILD_INDICES } from './modules/subworkflows/build_indices' addParams(params) - +include { CREATE_INTERVALS_BED } from './modules/local/create_intervals_bed' addParams(params) +include { TRIM_GALORE } from './modules/local/trim_galore.nf' addParams(params) ch_fasta.dump(tag: 'ch_fasta') -//ch_bwa = params.bwa ? Channel.value(file(params.bwa)) : BUILD_INDICES.out.bwa_built -//ch_dict = params.dict ? Channel.value(file(params.dict)) : GATK_CREATE_SEQUENCE_DICTIONARY.out -//ch_fai = params.fasta_fai ? Channel.value(file(params.fasta_fai)) : SAMTOOLS_FAIDX.out -//ch_dbsnp_tbi = params.dbsnp ? params.dbsnp_index ? Channel.value(file(params.dbsnp_index)) : dbsnp_tbi : "null" -//ch_germline_resource_tbi = params.germline_resource ? params.germline_resource_index ? Channel.value(file(params.germline_resource_index)) : germline_resource_tbi : "null" -//ch_known_indels_tbi = params.known_indels ? params.known_indels_index ? Channel.value(file(params.known_indels_index)) : known_indels_tbi.collect() : "null" -//ch_pon_tbi = params.pon ? params.pon_index ? Channel.value(file(params.pon_index)) : pon_tbi : "null" -//ch_intervals = params.no_intervals ? "null" : params.intervals && !('annotate' in step) ? Channel.value(file(params.intervals)) : intervalBuilt -workflow { - BUILD_INDICES( step, - ch_fasta, - ch_dbsnp, - ch_germline_resource, - ch_known_indels, - ch_pon) +workflow { + // BUILD INDICES + + BUILD_INDICES( + ch_dbsnp, + ch_fasta, + ch_germline_resource, + ch_known_indels, + ch_pon, + step) + + ch_bwa = params.bwa ? Channel.value(file(params.bwa)) : BUILD_INDICES.out.bwa_built + ch_dict = params.dict ? Channel.value(file(params.dict)) : BUILD_INDICES.out.dictBuilt + ch_fai = params.fasta_fai ? Channel.value(file(params.fasta_fai)) : BUILD_INDICES.out.fai_built + ch_dbsnp_tbi = params.dbsnp ? params.dbsnp_index ? Channel.value(file(params.dbsnp_index)) : BUILD_INDICES.out.dbsnp_tbi : "null" + ch_germline_resource_tbi = params.germline_resource ? params.germline_resource_index ? Channel.value(file(params.germline_resource_index)) : BUILD_INDICES.out.germline_resource_tbi : "null" + ch_known_indels_tbi = params.known_indels ? params.known_indels_index ? Channel.value(file(params.known_indels_index)) : BUILD_INDICES.out.known_indels_tbi.collect() : "null" + ch_pon_tbi = params.pon ? params.pon_index ? Channel.value(file(params.pon_index)) : BUILD_INDICES.out.pon_tbi : "null" + ch_intervals = params.no_intervals ? "null" : params.intervals && !('annotate' in step) ? 
Channel.value(file(params.intervals)) : BUILD_INDICES.out.intervalBuilt + ch_intervals.dump(tag: 'ch_intervals') + + // PREPROCESSING + if((!params.no_intervals) && step != 'annotate') + CREATE_INTERVALS_BED(ch_intervals) + + // BED INTERVAL CHANNEL TRANSFORMING + ch_bed_intervals = CREATE_INTERVALS_BED.out + .flatten() + .map { intervalFile -> + def duration = 0.0 + for (line in intervalFile.readLines()) { + final fields = line.split('\t') + if (fields.size() >= 5) duration += fields[4].toFloat() + else { + start = fields[1].toInteger() + end = fields[2].toInteger() + duration += (end - start) / params.nucleotides_per_second + } + } + [ duration, intervalFile] + }.toSortedList({ a, b -> b[0] <=> a[0] }) + .flatten().collate(2) + .map{duration, intervalFile -> intervalFile} + ch_bed_intervals.dump(tag:'bedintervals') + + if (params.no_intervals && step != 'annotate') { + file("${params.outdir}/no_intervals.bed").text = "no_intervals\n" + ch_bed_intervals = Channel.from(file("${params.outdir}/no_intervals.bed")) + } + //if(!('fastqc' in skipQC)) FASTQC(inputSample) + + if(params.trim_fastq) + TRIM_GALORE(inputSample) OUTPUT_DOCUMENTATION( ch_output_docs, @@ -351,8 +390,9 @@ workflow { MULTIQC( ch_multiqc_config, ch_multiqc_custom_config.collect().ifEmpty([]), - FASTQC.out.collect(), - GET_SOFTWARE_VERSIONS.out.software_versions_yml.collect(), + FASTQC.out.collect().ifEmpty([]), + TRIM_GALORE.out.report.collect().ifEmpty([]), + GET_SOFTWARE_VERSIONS.out.yml.collect(), ch_workflow_summary) } @@ -369,192 +409,6 @@ workflow.onComplete { } -// /* -// ================================================================================ -// BUILDING INDEXES -// ================================================================================ -// */ - -// // And then initialize channels based on params or indexes that were just built - -// process BuildBWAindexes { -// tag "${fasta}" - -// publishDir params.outdir, mode: params.publish_dir_mode, -// saveAs: {params.save_reference ? "reference_genome/BWAIndex/${it}" : null } - -// input: -// file(fasta) from ch_fasta - -// output: -// file("${fasta}.*") into bwa_built - -// when: !(params.bwa) && params.fasta && 'mapping' in step - -// script: -// """ -// bwa index ${fasta} -// """ -// } - -// ch_bwa = params.bwa ? Channel.value(file(params.bwa)) : bwa_built - -// process BuildDict { -// tag "${fasta}" - -// publishDir params.outdir, mode: params.publish_dir_mode, -// saveAs: {params.save_reference ? "reference_genome/${it}" : null } - -// input: -// file(fasta) from ch_fasta - -// output: -// file("${fasta.baseName}.dict") into dictBuilt - -// when: !(params.dict) && params.fasta && !('annotate' in step) && !('controlfreec' in step) - -// script: -// """ -// gatk --java-options "-Xmx${task.memory.toGiga()}g" \ -// CreateSequenceDictionary \ -// --REFERENCE ${fasta} \ -// --OUTPUT ${fasta.baseName}.dict -// """ -// } - -// ch_dict = params.dict ? Channel.value(file(params.dict)) : dictBuilt - -// process BuildFastaFai { -// tag "${fasta}" - -// publishDir params.outdir, mode: params.publish_dir_mode, -// saveAs: {params.save_reference ? "reference_genome/${it}" : null } - -// input: -// file(fasta) from ch_fasta - -// output: -// file("${fasta}.fai") into fai_built - -// when: !(params.fasta_fai) && params.fasta && !('annotate' in step) - -// script: -// """ -// samtools faidx ${fasta} -// """ -// } - -// ch_fai = params.fasta_fai ? 
Channel.value(file(params.fasta_fai)) : fai_built - -// process BuildDbsnpIndex { -// tag "${dbsnp}" - -// publishDir params.outdir, mode: params.publish_dir_mode, -// saveAs: {params.save_reference ? "reference_genome/${it}" : null } - -// input: -// file(dbsnp) from ch_dbsnp - -// output: -// file("${dbsnp}.tbi") into dbsnp_tbi - -// when: !(params.dbsnp_index) && params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || 'tnscope' in tools) - -// script: -// """ -// tabix -p vcf ${dbsnp} -// """ -// } - -// ch_dbsnp_tbi = params.dbsnp ? params.dbsnp_index ? Channel.value(file(params.dbsnp_index)) : dbsnp_tbi : "null" - -// process BuildGermlineResourceIndex { -// tag "${germlineResource}" - -// publishDir params.outdir, mode: params.publish_dir_mode, -// saveAs: {params.save_reference ? "reference_genome/${it}" : null } - -// input: -// file(germlineResource) from ch_germline_resource - -// output: -// file("${germlineResource}.tbi") into germline_resource_tbi - -// when: !(params.germline_resource_index) && params.germline_resource && 'mutect2' in tools - -// script: -// """ -// tabix -p vcf ${germlineResource} -// """ -// } - -// ch_germline_resource_tbi = params.germline_resource ? params.germline_resource_index ? Channel.value(file(params.germline_resource_index)) : germline_resource_tbi : "null" - -// process BuildKnownIndelsIndex { -// tag "${knownIndels}" - -// publishDir params.outdir, mode: params.publish_dir_mode, -// saveAs: {params.save_reference ? "reference_genome/${it}" : null } - -// input: -// each file(knownIndels) from ch_known_indels - -// output: -// file("${knownIndels}.tbi") into known_indels_tbi - -// when: !(params.known_indels_index) && params.known_indels && ('mapping' in step || 'preparerecalibration' in step) - -// script: -// """ -// tabix -p vcf ${knownIndels} -// """ -// } - -// ch_known_indels_tbi = params.known_indels ? params.known_indels_index ? Channel.value(file(params.known_indels_index)) : known_indels_tbi.collect() : "null" - -// process BuildPonIndex { -// tag "${pon}" - -// publishDir params.outdir, mode: params.publish_dir_mode, -// saveAs: {params.save_reference ? "reference_genome/${it}" : null } - -// input: -// file(pon) from ch_pon - -// output: -// file("${pon}.tbi") into pon_tbi - -// when: !(params.pon_index) && params.pon && ('tnscope' in tools || 'mutect2' in tools) - -// script: -// """ -// tabix -p vcf ${pon} -// """ -// } - -// ch_pon_tbi = params.pon ? params.pon_index ? Channel.value(file(params.pon_index)) : pon_tbi : "null" - -// process BuildIntervals { -// tag "${fastaFai}" - -// publishDir params.outdir, mode: params.publish_dir_mode, -// saveAs: {params.save_reference ? "reference_genome/${it}" : null } - -// input: -// file(fastaFai) from ch_fai - -// output: -// file("${fastaFai.baseName}.bed") into intervalBuilt - -// when: !(params.intervals) && !('annotate' in step) && !('controlfreec' in step) - -// script: -// """ -// awk -v FS='\t' -v OFS='\t' '{ print \$1, \"0\", \$2 }' ${fastaFai} > ${fastaFai.baseName}.bed -// """ -// } - -// ch_intervals = params.no_intervals ? "null" : params.intervals && !('annotate' in step) ? 
Channel.value(file(params.intervals)) : intervalBuilt // /* // ================================================================================ @@ -562,83 +416,7 @@ workflow.onComplete { // ================================================================================ // */ -// // STEP 0: CREATING INTERVALS FOR PARALLELIZATION (PREPROCESSING AND VARIANT CALLING) - -// process CreateIntervalBeds { -// tag "${intervals}" - -// input: -// file(intervals) from ch_intervals - -// output: -// file '*.bed' into bedIntervals mode flatten - -// when: (!params.no_intervals) && step != 'annotate' - -// script: -// // If the interval file is BED format, the fifth column is interpreted to -// // contain runtime estimates, which is then used to combine short-running jobs -// if (hasExtension(intervals, "bed")) -// """ -// awk -vFS="\t" '{ -// t = \$5 # runtime estimate -// if (t == "") { -// # no runtime estimate in this row, assume default value -// t = (\$3 - \$2) / ${params.nucleotides_per_second} -// } -// if (name == "" || (chunk > 600 && (chunk + t) > longest * 1.05)) { -// # start a new chunk -// name = sprintf("%s_%d-%d.bed", \$1, \$2+1, \$3) -// chunk = 0 -// longest = 0 -// } -// if (t > longest) -// longest = t -// chunk += t -// print \$0 > name -// }' ${intervals} -// """ -// else if (hasExtension(intervals, "interval_list")) -// """ -// grep -v '^@' ${intervals} | awk -vFS="\t" '{ -// name = sprintf("%s_%d-%d", \$1, \$2, \$3); -// printf("%s\\t%d\\t%d\\n", \$1, \$2-1, \$3) > name ".bed" -// }' -// """ -// else -// """ -// awk -vFS="[:-]" '{ -// name = sprintf("%s_%d-%d", \$1, \$2, \$3); -// printf("%s\\t%d\\t%d\\n", \$1, \$2-1, \$3) > name ".bed" -// }' ${intervals} -// """ -// } - -// bedIntervals = bedIntervals -// .map { intervalFile -> -// def duration = 0.0 -// for (line in intervalFile.readLines()) { -// final fields = line.split('\t') -// if (fields.size() >= 5) duration += fields[4].toFloat() -// else { -// start = fields[1].toInteger() -// end = fields[2].toInteger() -// duration += (end - start) / params.nucleotides_per_second -// } -// } -// [duration, intervalFile] -// }.toSortedList({ a, b -> b[0] <=> a[0] }) -// .flatten().collate(2) -// .map{duration, intervalFile -> intervalFile} - -// bedIntervals = bedIntervals.dump(tag:'bedintervals') - -// if (params.no_intervals && step != 'annotate') { -// file("${params.outdir}/no_intervals.bed").text = "no_intervals\n" -// bedIntervals = Channel.from(file("${params.outdir}/no_intervals.bed")) -// } - -// (intBaseRecalibrator, intApplyBQSR, intHaplotypeCaller, intFreebayesSingle, intMpileup, bedIntervals) = bedIntervals.into(6) +// (intBaseRecalibrator, intApplyBQSR, intHaplotypeCaller, intFreebayesSingle, intMpileup, bedIntervals) = bedIntervals.into(6) // // STEP 0.5: QC ON READS @@ -646,27 +424,6 @@ workflow.onComplete { // // TODO: Use only one process for FastQC for FASTQ files and uBAM files // // FASTQ and uBAM files are renamed based on the sample name -// process FastQCFQ { -// label 'FastQC' -// label 'cpus_2' - -// tag "${idPatient}-${idRun}" - -// publishDir "${params.outdir}/Reports/${idSample}/FastQC/${idSample}_${idRun}", mode: params.publish_dir_mode - -// input: -// set idPatient, idSample, idRun, file("${idSample}_${idRun}_R1.fastq.gz"), file("${idSample}_${idRun}_R2.fastq.gz") from inputPairReadsFastQC - -// output: -// file("*.{html,zip}") into fastQCFQReport - -// when: !('fastqc' in skipQC) - -// script: -// """ -// fastqc -t 2 -q ${idSample}_${idRun}_R1.fastq.gz ${idSample}_${idRun}_R2.fastq.gz -// """ -// } 
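The TODO above asks for a single FastQC process that covers both FASTQ pairs and uBAM files. A minimal DSL2 sketch of that unification, assuming a generic reads staging name and relying on FastQC accepting BAM input directly (both are assumptions for illustration, not part of this patch):

// Sketch only: one FastQC process for FASTQ pairs and uBAM files.
process FASTQC {
    label 'FastQC'
    label 'cpus_2'

    tag "${idPatient}-${idRun}"

    publishDir "${params.outdir}/Reports/${idSample}/FastQC/${idSample}_${idRun}", mode: params.publish_dir_mode

    input:
        // reads is either an R1/R2 FASTQ pair or a single unmapped BAM
        tuple val(idPatient), val(idSample), val(idRun), path(reads)

    output:
        path "*.{html,zip}"

    script:
    // FastQC autodetects .fastq.gz and .bam inputs, so one command serves both
    """
    fastqc -t ${task.cpus} -q ${reads}
    """
}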
 // process FastQCBAM {
 // label 'FastQC'
@@ -692,8 +449,6 @@ workflow.onComplete {

 // fastQCReport = fastQCFQReport.mix(fastQCBAMReport)

-// fastQCReport = fastQCReport.dump(tag:'FastQC')
-
 // process TrimGalore {
 // label 'TrimGalore'

diff --git a/modules/local/create_intervals_bed.nf b/modules/local/create_intervals_bed.nf
new file mode 100644
index 0000000000..a5469fecb1
--- /dev/null
+++ b/modules/local/create_intervals_bed.nf
@@ -0,0 +1,50 @@
+include { hasExtension } from './functions'
+
+process CREATE_INTERVALS_BED {
+    tag "${intervals}"
+
+    input:
+        path intervals
+
+    output:
+        path ('*.bed')//mode flatten
+
+
+    script:
+    // If the interval file is BED format, the fifth column is interpreted to
+    // contain runtime estimates, which is then used to combine short-running jobs
+    if (hasExtension(intervals, "bed"))
+        """
+        awk -vFS="\t" '{
+            t = \$5 # runtime estimate
+            if (t == "") {
+                # no runtime estimate in this row, assume default value
+                t = (\$3 - \$2) / ${params.nucleotides_per_second}
+            }
+            if (name == "" || (chunk > 600 && (chunk + t) > longest * 1.05)) {
+                # start a new chunk
+                name = sprintf("%s_%d-%d.bed", \$1, \$2+1, \$3)
+                chunk = 0
+                longest = 0
+            }
+            if (t > longest)
+                longest = t
+            chunk += t
+            print \$0 > name
+        }' ${intervals}
+        """
+    else if (hasExtension(intervals, "interval_list"))
+        """
+        grep -v '^@' ${intervals} | awk -vFS="\t" '{
+            name = sprintf("%s_%d-%d", \$1, \$2, \$3);
+            printf("%s\\t%d\\t%d\\n", \$1, \$2-1, \$3) > name ".bed"
+        }'
+        """
+    else
+        """
+        awk -vFS="[:-]" '{
+            name = sprintf("%s_%d-%d", \$1, \$2, \$3);
+            printf("%s\\t%d\\t%d\\n", \$1, \$2-1, \$3) > name ".bed"
+        }' ${intervals}
+        """
+}
\ No newline at end of file
diff --git a/modules/local/get_software_versions.nf b/modules/local/get_software_versions.nf
index 5d29cf8319..1b70ecddbc 100644
--- a/modules/local/get_software_versions.nf
+++ b/modules/local/get_software_versions.nf
@@ -9,14 +9,14 @@ process GET_SOFTWARE_VERSIONS {
     }

     output:
-    path 'software_versions_mqc.yaml', emit: software_versions_yml
-    path "software_versions.csv", emit: software_versions_csv
+    path 'software_versions_mqc.yaml', emit: yml
+    path "software_versions.csv", emit: csv

     script:
     """
     alleleCounter --version &> v_allelecount.txt 2>&1 || true
     bcftools --version &> v_bcftools.txt 2>&1 || true
-    bwa &> v_bwa.txt 2>&1 || true
+    bwa-mem2 version &> v_bwamem2.txt 2>&1 || true
     cnvkit.py version &> v_cnvkit.txt 2>&1 || true
     configManta.py --version &> v_manta.txt 2>&1 || true
     configureStrelkaGermlineWorkflow.py --version &> v_strelka.txt 2>&1 || true
diff --git a/modules/local/trim_galore.nf b/modules/local/trim_galore.nf
new file mode 100644
index 0000000000..fa3e69e8c1
--- /dev/null
+++ b/modules/local/trim_galore.nf
@@ -0,0 +1,53 @@
+process TRIM_GALORE {
+    label 'TrimGalore'
+
+    tag "${idPatient}-${idRun}"
+
+    publishDir "${params.outdir}/Reports/${idSample}/TrimGalore/${idSample}_${idRun}", mode: params.publish_dir_mode,
+        saveAs: {filename ->
+            if (filename.indexOf("_fastqc") > 0) "FastQC/$filename"
+            else if (filename.indexOf("trimming_report.txt") > 0) "logs/$filename"
+            else if (params.save_trimmed) filename
+            else null
+        }
+
+    input:
+        tuple val(idPatient), val(idSample), val(idRun), file("${idSample}_${idRun}_R1.fastq.gz"), file("${idSample}_${idRun}_R2.fastq.gz")
+
+    output:
+        path "*.{html,zip,txt}", emit: report
+        tuple idPatient, idSample, idRun, file("${idSample}_${idRun}_R1_val_1.fq.gz"), file("${idSample}_${idRun}_R2_val_2.fq.gz") , emit: trimmed_reads
+
+
+    script:
+    // Calculate number of --cores for
TrimGalore based on value of task.cpus + // See: https://github.com/FelixKrueger/TrimGalore/blob/master/Changelog.md#version-060-release-on-1-mar-2019 + // See: https://github.com/nf-core/atacseq/pull/65 + def cores = 1 + if (task.cpus) { + cores = (task.cpus as int) - 4 + if (cores < 1) cores = 1 + if (cores > 4) cores = 4 + } + c_r1 = params.clip_r1 > 0 ? "--clip_r1 ${params.clip_r1}" : '' + c_r2 = params.clip_r2 > 0 ? "--clip_r2 ${params.clip_r2}" : '' + tpc_r1 = params.three_prime_clip_r1 > 0 ? "--three_prime_clip_r1 ${params.three_prime_clip_r1}" : '' + tpc_r2 = params.three_prime_clip_r2 > 0 ? "--three_prime_clip_r2 ${params.three_prime_clip_r2}" : '' + nextseq = params.trim_nextseq > 0 ? "--nextseq ${params.trim_nextseq}" : '' + """ + trim_galore \ + --cores ${cores} \ + --paired \ + --fastqc \ + --gzip \ + ${c_r1} ${c_r2} \ + ${tpc_r1} ${tpc_r2} \ + ${nextseq} \ + ${idSample}_${idRun}_R1.fastq.gz ${idSample}_${idRun}_R2.fastq.gz + + mv *val_1_fastqc.html "${idSample}_${idRun}_R1.trimmed_fastqc.html" + mv *val_2_fastqc.html "${idSample}_${idRun}_R2.trimmed_fastqc.html" + mv *val_1_fastqc.zip "${idSample}_${idRun}_R1.trimmed_fastqc.zip" + mv *val_2_fastqc.zip "${idSample}_${idRun}_R2.trimmed_fastqc.zip" + """ +} diff --git a/modules/nf-core/bwamem2_mem.nf b/modules/nf-core/bwamem2_mem.nf new file mode 100644 index 0000000000..1f73664abd --- /dev/null +++ b/modules/nf-core/bwamem2_mem.nf @@ -0,0 +1,24 @@ +params.bwa_options = "-M -B 2" +params.sequencer = "ILLUMINA" + +process BWAMEM2_MEM { + tag {id} + + publishDir "${params.outdir}/bwamem2_mem", mode: 'copy' + + input: + tuple val(id), path(reads) + path genomeindex + val indexprefix + + output: + tuple path("*.bam"), path("*.bai") + + script: + CN = params.sequencing_center ? "\\tCN:${params.sequencing_center}\\t" : "" + """ + bwa-mem2 mem -t ${task.cpus} -R "@RG\\tID:${id}${CN}\\tLB:${id}\\tSM:${id}\\tPL:${params.sequencer}" \\ + ${params.bwa_options} ${indexprefix} ${reads} | samtools sort -@8 -O BAM -o ${id}.bam - + samtools index ${id}.bam + """ +} \ No newline at end of file diff --git a/modules/nf-core/fastqc.nf b/modules/nf-core/fastqc.nf index d152faf568..b7e58261e1 100644 --- a/modules/nf-core/fastqc.nf +++ b/modules/nf-core/fastqc.nf @@ -1,6 +1,3 @@ -/* - * FastQC - */ process FASTQC { label 'FastQC' label 'cpus_2' @@ -15,11 +12,9 @@ process FASTQC { output: path "*.{html,zip}" - // when: !('fastqc' in skipQC) script: """ fastqc -t 2 -q ${idSample}_${idRun}_R1.fastq.gz ${idSample}_${idRun}_R2.fastq.gz """ } - diff --git a/modules/nf-core/htslib_tabix.nf b/modules/nf-core/htslib_tabix.nf index 146ccb664f..8786ce63f1 100644 --- a/modules/nf-core/htslib_tabix.nf +++ b/modules/nf-core/htslib_tabix.nf @@ -13,4 +13,4 @@ process HTSLIB_TABIX { """ tabix -p vcf ${vcf} """ -} \ No newline at end of file +} diff --git a/modules/nf-core/multiqc.nf b/modules/nf-core/multiqc.nf index 6cec640723..5daf608196 100644 --- a/modules/nf-core/multiqc.nf +++ b/modules/nf-core/multiqc.nf @@ -5,9 +5,6 @@ if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) { custom_runName = workflow.runName } -/* - * MultiQC - */ process MULTIQC { publishDir "${params.outdir}/multiqc", mode: params.publish_dir_mode @@ -16,6 +13,7 @@ process MULTIQC { path mqc_custom_config // TODO nf-core: Add in log files from your new processes for MultiQC to find! path fastqc + path trim_galore path software_versions val workflow_summary @@ -28,7 +26,6 @@ process MULTIQC { rtitle = custom_runName ? "--title \"$custom_runName\"" : '' rfilename = custom_runName ? 
"--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : '' custom_config_file = params.multiqc_config ? "--config $mqc_custom_config" : '' - // TODO nf-core: Specify which MultiQC modules to use with -m for a faster run time """ echo '$workflow_summary' > workflow_summary_mqc.yaml multiqc -f $rtitle $rfilename $custom_config_file . diff --git a/modules/subworkflows/build_indices.nf b/modules/subworkflows/build_indices.nf index 71d8a417c0..6af688b36a 100644 --- a/modules/subworkflows/build_indices.nf +++ b/modules/subworkflows/build_indices.nf @@ -19,42 +19,42 @@ include { BUILD_INTERVALS } from '../local/build_intervals.nf' workflow BUILD_INDICES{ take: - step - ch_fasta - ch_dbsnp - ch_germline_resource - ch_known_indels - ch_pon + ch_dbsnp + ch_fasta + ch_germline_resource + ch_known_indels + ch_pon + step main: - if(! (params.bwa) && params.fasta && 'mapping' in step) + if (!(params.bwa) && params.fasta && 'mapping' in step) BWAMEM2_INDEX(ch_fasta) - if(!(params.dict) && params.fasta && !('annotate' in step) && !('controlfreec' in step)) + if (!(params.dict) && params.fasta && !('annotate' in step) && !('controlfreec' in step)) GATK_CREATE_SEQUENCE_DICTIONARY(ch_fasta) - if(!(params.fasta_fai) && params.fasta && !('annotate' in step)) + if (!(params.fasta_fai) && params.fasta && !('annotate' in step)) SAMTOOLS_FAIDX(ch_fasta) - if(!(params.dbsnp_index) && params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || 'tnscope' in tools)) + if (!(params.dbsnp_index) && params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || 'tnscope' in tools)) HTSLIB_TABIX_DBSNP(ch_dbsnp) - if(!(params.germline_resource_index) && params.germline_resource && 'mutect2' in tools) + if (!(params.germline_resource_index) && params.germline_resource && 'mutect2' in tools) HTSLIB_TABIX_GERMLINE_RESOURCE(ch_germline_resource) - if(!(params.known_indels_index) && params.known_indels && ('mapping' in step || 'preparerecalibration' in step)) + if (!(params.known_indels_index) && params.known_indels && ('mapping' in step || 'preparerecalibration' in step)) HTSLIB_TABIX_KNOWN_INDELS(ch_known_indels) - if(!(params.pon_index) && params.pon && ('tnscope' in tools || 'mutect2' in tools)) + if (!(params.pon_index) && params.pon && ('tnscope' in tools || 'mutect2' in tools)) HTSLIB_TABIX_PON(ch_pon) - if(!(params.intervals) && !('annotate' in step) && !('controlfreec' in step)){ + if (!(params.intervals) && !('annotate' in step) && !('controlfreec' in step)){ ch_fai = params.fasta_fai ? 
Channel.value(file(params.fasta_fai)) : SAMTOOLS_FAIDX.out + ch_fai.dump(tag: 'ch_fai') BUILD_INTERVALS(ch_fai) } - emit: bwa_built = BWAMEM2_INDEX.out dictBuilt = GATK_CREATE_SEQUENCE_DICTIONARY.out @@ -64,7 +64,4 @@ workflow BUILD_INDICES{ known_indels_tbi = HTSLIB_TABIX_KNOWN_INDELS.out pon_tbi = HTSLIB_TABIX_PON.out intervalBuilt = BUILD_INTERVALS.out - } - - From 2393a3171df2433a118c68bdcc11eaed00bd7f04 Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Wed, 15 Jul 2020 18:14:23 +0200 Subject: [PATCH 022/200] Update CHANGELOG.md Co-authored-by: Gisela Gabernet Co-authored-by: Maxime Garcia --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ce1a5e809..bac12f4210 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) a ## [dsl2](https://github.com/nf-core/sarek/tree/dev) - [#238](https://github.com/nf-core/sarek/pull/238) -Add subworkflow for building all the indices - +- [#241](https://github.com/nf-core/sarek/pull/241) -Add modules and workflows parts for preprocessing steps ## [dev](https://github.com/nf-core/sarek/tree/dev) - [#234](https://github.com/nf-core/sarek/pull/234) -Switching to DSL2 From 727cb1e396f1a835781bbb3f3fda945c45b5d6d3 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 15 Jul 2020 23:00:56 +0200 Subject: [PATCH 023/200] fix header when ansi-log is false --- lib/Headers.groovy | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/Headers.groovy b/lib/Headers.groovy index 1d9d02eeb5..ee3817cfde 100644 --- a/lib/Headers.groovy +++ b/lib/Headers.groovy @@ -23,8 +23,8 @@ class Headers { static String nf_core(workflow, monochrome_logs) { Map colors = log_colours(monochrome_logs) String.format( - """\n - -${colors.dim}----------------------------------------------------${colors.reset}- +""" +-${colors.dim}----------------------------------------------------${colors.reset}- ${colors.green},--.${colors.black}/${colors.green},-.${colors.reset} ${colors.blue} ___ __ __ __ ___ ${colors.green}/,-._.--~\'${colors.reset} ${colors.blue} |\\ | |__ __ / ` / \\ |__) |__ ${colors.yellow}} {${colors.reset} @@ -39,7 +39,7 @@ class Headers { ${colors.purple} ${workflow.manifest.name} v${workflow.manifest.version}${colors.reset} -${colors.dim}--------------------------------------------------${colors.reset}- - """.stripIndent() +""".stripIndent() ) } } From 8b7d4316547a015c9cd6f314d0e11ab7a2fa1706 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 16 Jul 2020 00:04:14 +0200 Subject: [PATCH 024/200] improve bwamem2_mem module --- modules/local/bwamem2_mem.nf | 29 +++++++++++++++++++++++++++++ modules/local/mapreads.nf | 0 modules/nf-core/bwamem2_mem.nf | 24 ------------------------ 3 files changed, 29 insertions(+), 24 deletions(-) create mode 100644 modules/local/bwamem2_mem.nf delete mode 100644 modules/local/mapreads.nf delete mode 100644 modules/nf-core/bwamem2_mem.nf diff --git a/modules/local/bwamem2_mem.nf b/modules/local/bwamem2_mem.nf new file mode 100644 index 0000000000..7e425534d7 --- /dev/null +++ b/modules/local/bwamem2_mem.nf @@ -0,0 +1,29 @@ +params.bwa_options = "-K 100000000 -M" +params.sequencer = "ILLUMINA" + +process BWAMEM2_MEM { + label 'CPUS_MAX' + + tag "${sample}_${run}" + + publishDir "${params.outdir}/bwamem2_mem", mode: 'copy' + + input: + tuple val(patient), val(sample), val(run), path(read1), path(read2) + path bwa + path fasta + path fai + + output: + tuple val(patient), 
val(sample), val(run), path("*.bam"), path("*.bai") + + script: + CN = params.sequencing_center ? "CN:${params.sequencing_center}\\t" : "" + readGroup = "@RG\\tID:${run}\\t${CN}PU:${run}\\tSM:${sample}\\tLB:${sample}\\tPL:${params.sequencer}" + """ + bwa-mem2 mem ${params.bwa_options} -R \"${readGroup}\" -t ${task.cpus} \ + ${fasta} ${read1} ${read2} | \ + samtools sort --threads ${task.cpus} -m 2G - > ${sample}_${run}.bam + samtools index ${sample}_${run}.bam + """ +} \ No newline at end of file diff --git a/modules/local/mapreads.nf b/modules/local/mapreads.nf deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/modules/nf-core/bwamem2_mem.nf b/modules/nf-core/bwamem2_mem.nf deleted file mode 100644 index 1f73664abd..0000000000 --- a/modules/nf-core/bwamem2_mem.nf +++ /dev/null @@ -1,24 +0,0 @@ -params.bwa_options = "-M -B 2" -params.sequencer = "ILLUMINA" - -process BWAMEM2_MEM { - tag {id} - - publishDir "${params.outdir}/bwamem2_mem", mode: 'copy' - - input: - tuple val(id), path(reads) - path genomeindex - val indexprefix - - output: - tuple path("*.bam"), path("*.bai") - - script: - CN = params.sequencing_center ? "\\tCN:${params.sequencing_center}\\t" : "" - """ - bwa-mem2 mem -t ${task.cpus} -R "@RG\\tID:${id}${CN}\\tLB:${id}\\tSM:${id}\\tPL:${params.sequencer}" \\ - ${params.bwa_options} ${indexprefix} ${reads} | samtools sort -@8 -O BAM -o ${id}.bam - - samtools index ${id}.bam - """ -} \ No newline at end of file From ac503af731988c4c04d246a66809eb2c3ca0d478 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 16 Jul 2020 00:06:14 +0200 Subject: [PATCH 025/200] snake_case all functions --- modules/local/functions.nf | 92 +++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/modules/local/functions.nf b/modules/local/functions.nf index 3e9bb62ccd..d90e1bc024 100644 --- a/modules/local/functions.nf +++ b/modules/local/functions.nf @@ -5,13 +5,13 @@ */ // Check if a row has the expected number of item -def checkNumberOfItem(row, number) { +def check_number_of_item(row, number) { if (row.size() != number) exit 1, "Malformed row in TSV file: ${row}, see --help for more information" return true } // Check parameter existence -def checkParameterExistence(it, list) { +def check_parameter_existence(it, list) { if (!list.contains(it)) { log.warn "Unknown parameter: ${it}" return false @@ -20,12 +20,12 @@ def checkParameterExistence(it, list) { } // Compare each parameter with a list of parameters -def checkParameterList(list, realList) { - return list.every{ checkParameterExistence(it, realList) } +def check_parameter_list(list, realList) { + return list.every{ check_parameter_existence(it, realList) } } // Define list of available tools to annotate -def defineAnnoList() { +def define_anno_list() { return [ 'haplotypecaller', 'manta', @@ -36,7 +36,7 @@ def defineAnnoList() { } // Define list of skipable QC tools -def defineSkipQClist() { +def define_skip_qc_list() { return [ 'bamqc', 'baserecalibrator', @@ -53,7 +53,7 @@ def defineSkipQClist() { } // Define list of available step -def defineStepList() { +def define_step_list() { return [ 'annotate', 'controlfreec', @@ -65,7 +65,7 @@ def defineStepList() { } // Define list of available tools -def defineToolList() { +def define_tool_list() { return [ 'ascat', 'cnvkit', @@ -89,20 +89,20 @@ def defineToolList() { // Channeling the TSV file containing BAM. 
// Format is: "subject gender status sample bam bai" -def extractBam(tsvFile) { +def extract_bam(tsvFile) { Channel.from(tsvFile) .splitCsv(sep: '\t') .map { row -> - checkNumberOfItem(row, 6) + check_number_of_item(row, 6) def idPatient = row[0] def gender = row[1] - def status = returnStatus(row[2].toInteger()) + def status = return_status(row[2].toInteger()) def idSample = row[3] - def bamFile = returnFile(row[4]) - def baiFile = returnFile(row[5]) + def bamFile = return_file(row[4]) + def baiFile = return_file(row[5]) - if (!hasExtension(bamFile, "bam")) exit 1, "File: ${bamFile} has the wrong extension. See --help for more information" - if (!hasExtension(baiFile, "bai")) exit 1, "File: ${baiFile} has the wrong extension. See --help for more information" + if (!has_extension(bamFile, "bam")) exit 1, "File: ${bamFile} has the wrong extension. See --help for more information" + if (!has_extension(baiFile, "bai")) exit 1, "File: ${baiFile} has the wrong extension. See --help for more information" return [idPatient, gender, status, idSample, bamFile, baiFile] } @@ -111,7 +111,7 @@ def extractBam(tsvFile) { // Create a channel of germline FASTQs from a directory pattern: "my_samples/*/" // All FASTQ files in subdirectories are collected and emitted; // they must have _R1_ and _R2_ in their names. -def extractFastqFromDir(pattern) { +def extract_fastq_from_dir(pattern) { def fastq = Channel.create() // a temporary channel does all the work Channel @@ -125,7 +125,7 @@ def extractFastqFromDir(pattern) { assert path1.getName().contains('_R1_') path2 = file(path1.toString().replace('_R1_', '_R2_')) if (!path2.exists()) error "Path '${path2}' not found" - (flowcell, lane) = flowcellLaneFromFastq(path1) + (flowcell, lane) = flowcellLane_from_fastq(path1) patient = sampleId gender = 'ZZ' // unused status = 0 // normal (not tumor) @@ -138,7 +138,7 @@ def extractFastqFromDir(pattern) { } // Extract gender and status from Channel -def extractInfos(channel) { +def extract_infos(channel) { def genderMap = [:] def statusMap = [:] channel = channel.map{ it -> @@ -156,26 +156,26 @@ def extractInfos(channel) { // Channeling the TSV file containing FASTQ or BAM // Format is: "subject gender status sample lane fastq1 fastq2" // or: "subject gender status sample lane bam" -def extractFastq(tsvFile) { +def extract_fastq(tsvFile) { Channel.from(tsvFile) .splitCsv(sep: '\t') .map { row -> def idPatient = row[0] def gender = row[1] - def status = returnStatus(row[2].toInteger()) + def status = return_status(row[2].toInteger()) def idSample = row[3] def idRun = row[4] - def file1 = returnFile(row[5]) + def file1 = return_file(row[5]) def file2 = "null" - if (hasExtension(file1, "fastq.gz") || hasExtension(file1, "fq.gz") || hasExtension(file1, "fastq") || hasExtension(file1, "fq")) { - checkNumberOfItem(row, 7) - file2 = returnFile(row[6]) - if (!hasExtension(file2, "fastq.gz") && !hasExtension(file2, "fq.gz") && !hasExtension(file2, "fastq") && !hasExtension(file2, "fq")) exit 1, "File: ${file2} has the wrong extension. 
See --help for more information"
+            if (has_extension(file1, "fastq") || has_extension(file1, "fq") || has_extension(file2, "fastq") || has_extension(file2, "fq")) {
                 exit 1, "We recommend using gzipped FASTQ files to reduce your data footprint."
             }
         }
-        else if (hasExtension(file1, "bam")) checkNumberOfItem(row, 6)
+        else if (has_extension(file1, "bam")) check_number_of_item(row, 6)
         else "No recognisable extension for input file: ${file1}"

         [idPatient, gender, status, idSample, idRun, file1, file2]
@@ -184,18 +184,18 @@
 // Channeling the TSV file containing mpileup
 // Format is: "subject gender status sample pileup"
-def extractPileup(tsvFile) {
+def extract_pileup(tsvFile) {
     Channel.from(tsvFile)
         .splitCsv(sep: '\t')
         .map { row ->
-            checkNumberOfItem(row, 5)
+            check_number_of_item(row, 5)
             def idPatient = row[0]
             def gender = row[1]
-            def status = returnStatus(row[2].toInteger())
+            def status = return_status(row[2].toInteger())
             def idSample = row[3]
-            def mpileup = returnFile(row[4])
+            def mpileup = return_file(row[4])

-            if (!hasExtension(mpileup, "pileup")) exit 1, "File: ${mpileup} has the wrong extension. See --help for more information"
+            if (!has_extension(mpileup, "pileup")) exit 1, "File: ${mpileup} has the wrong extension. See --help for more information"

             return [idPatient, gender, status, idSample, mpileup]
         }
 }

 // Channeling the TSV file containing Recalibration Tables.
 // Format is: "subject gender status sample bam bai recalTable"
-def extractRecal(tsvFile) {
+def extract_recal(tsvFile) {
     Channel.from(tsvFile)
         .splitCsv(sep: '\t')
         .map { row ->
-            checkNumberOfItem(row, 7)
+            check_number_of_item(row, 7)
             def idPatient = row[0]
             def gender = row[1]
-            def status = returnStatus(row[2].toInteger())
+            def status = return_status(row[2].toInteger())
             def idSample = row[3]
-            def bamFile = returnFile(row[4])
-            def baiFile = returnFile(row[5])
-            def recalTable = returnFile(row[6])
+            def bamFile = return_file(row[4])
+            def baiFile = return_file(row[5])
+            def recalTable = return_file(row[6])

-            if (!hasExtension(bamFile, "bam")) exit 1, "File: ${bamFile} has the wrong extension. See --help for more information"
-            if (!hasExtension(baiFile, "bai")) exit 1, "File: ${baiFile} has the wrong extension. See --help for more information"
-            if (!hasExtension(recalTable, "recal.table")) exit 1, "File: ${recalTable} has the wrong extension. See --help for more information"
+            if (!has_extension(bamFile, "bam")) exit 1, "File: ${bamFile} has the wrong extension. See --help for more information"
+            if (!has_extension(baiFile, "bai")) exit 1, "File: ${baiFile} has the wrong extension. See --help for more information"
+            if (!has_extension(recalTable, "recal.table")) exit 1, "File: ${recalTable} has the wrong extension.
See --help for more information" [idPatient, gender, status, idSample, bamFile, baiFile, recalTable] } } // Parse first line of a FASTQ file, return the flowcell id and lane number. -def flowcellLaneFromFastq(path) { +def flowcellLane_from_fastq(path) { // expected format: // xx:yy:FLOWCELLID:LANE:... (seven fields) // or @@ -252,24 +252,24 @@ def flowcellLaneFromFastq(path) { } // Check file extension -def hasExtension(it, extension) { +def has_extension(it, extension) { it.toString().toLowerCase().endsWith(extension.toLowerCase()) } // Return file if it exists -def returnFile(it) { +def return_file(it) { if (!file(it).exists()) exit 1, "Missing file in TSV file: ${it}, see --help for more information" return file(it) } // Remove .ann .gz and .vcf extension from a VCF file -def reduceVCF(file) { +def reduce_vcf(file) { return file.fileName.toString().minus(".ann").minus(".vcf").minus(".gz") } // Return status [0,1] // 0 == Normal, 1 == Tumor -def returnStatus(it) { +def return_status(it) { if (!(it in [0, 1])) exit 1, "Status is not recognized in TSV file: ${it}, see --help for more information" return it } From a6168b14773ca1570269982514e99f65f5ac60e4 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 16 Jul 2020 00:08:56 +0200 Subject: [PATCH 026/200] code polishing --- modules/local/build_intervals.nf | 10 ++++----- modules/local/create_intervals_bed.nf | 9 ++++---- modules/local/gatk_dict.nf | 2 +- modules/local/get_software_versions.nf | 2 +- modules/local/output_documentation.nf | 8 +++---- modules/local/trim_galore.nf | 11 +++++----- modules/nf-core/bwamem2_index.nf | 2 +- modules/nf-core/fastqc.nf | 7 +++---- modules/nf-core/htslib_tabix.nf | 4 ++-- modules/nf-core/multiqc.nf | 29 +++++++++++++------------- modules/nf-core/samtools_faidx.nf | 2 +- modules/subworkflows/build_indices.nf | 26 +++++++++++------------ 12 files changed, 52 insertions(+), 60 deletions(-) diff --git a/modules/local/build_intervals.nf b/modules/local/build_intervals.nf index d235885335..ae35588344 100644 --- a/modules/local/build_intervals.nf +++ b/modules/local/build_intervals.nf @@ -1,19 +1,17 @@ process BUILD_INTERVALS { - tag "${fastaFai}" + tag "${fai}" publishDir params.outdir, mode: params.publish_dir_mode, saveAs: {params.save_reference ? 
"reference_genome/${it}" : null } input: - path file(fastaFai) + path fai output: - path file("${fastaFai.baseName}.bed") - - //when: !(params.intervals) && !('annotate' in step) && !('controlfreec' in step) + path "${fai.baseName}.bed" script: """ - awk -v FS='\t' -v OFS='\t' '{ print \$1, \"0\", \$2 }' ${fastaFai} > ${fastaFai.baseName}.bed + awk -v FS='\t' -v OFS='\t' '{ print \$1, \"0\", \$2 }' ${fai} > ${fai.baseName}.bed """ } \ No newline at end of file diff --git a/modules/local/create_intervals_bed.nf b/modules/local/create_intervals_bed.nf index a5469fecb1..b073dd5b17 100644 --- a/modules/local/create_intervals_bed.nf +++ b/modules/local/create_intervals_bed.nf @@ -1,4 +1,4 @@ -include { hasExtension } from './functions' +include { has_extension } from './functions' process CREATE_INTERVALS_BED { tag "${intervals}" @@ -7,13 +7,12 @@ process CREATE_INTERVALS_BED { path intervals output: - path ('*.bed')//mode flatten - + path ('*.bed') script: // If the interval file is BED format, the fifth column is interpreted to // contain runtime estimates, which is then used to combine short-running jobs - if (hasExtension(intervals, "bed")) + if (has_extension(intervals, "bed")) """ awk -vFS="\t" '{ t = \$5 # runtime estimate @@ -33,7 +32,7 @@ process CREATE_INTERVALS_BED { print \$0 > name }' ${intervals} """ - else if (hasExtension(intervals, "interval_list")) + else if (has_extension(intervals, "interval_list")) """ grep -v '^@' ${intervals} | awk -vFS="\t" '{ name = sprintf("%s_%d-%d", \$1, \$2, \$3); diff --git a/modules/local/gatk_dict.nf b/modules/local/gatk_dict.nf index 70750d4990..9c811940c0 100644 --- a/modules/local/gatk_dict.nf +++ b/modules/local/gatk_dict.nf @@ -8,7 +8,7 @@ process GATK_CREATE_SEQUENCE_DICTIONARY { path fasta output: - path ("${fasta.baseName}.dict") + path "${fasta.baseName}.dict" script: """ diff --git a/modules/local/get_software_versions.nf b/modules/local/get_software_versions.nf index 1b70ecddbc..8c52334839 100644 --- a/modules/local/get_software_versions.nf +++ b/modules/local/get_software_versions.nf @@ -9,7 +9,7 @@ process GET_SOFTWARE_VERSIONS { } output: - path 'software_versions_mqc.yaml', emit: yml + path "software_versions_mqc.yaml", emit: yml path "software_versions.csv", emit: csv script: diff --git a/modules/local/output_documentation.nf b/modules/local/output_documentation.nf index b6d24b0f58..bd3f9cdd4d 100644 --- a/modules/local/output_documentation.nf +++ b/modules/local/output_documentation.nf @@ -5,14 +5,14 @@ process OUTPUT_DOCUMENTATION { publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode input: - path output_docs - path images + path output_docs + path images output: - path "results_description.html" + path "results_description.html" script: """ - markdown_to_html.py $output_docs -o results_description.html + markdown_to_html.py ${output_docs} -o results_description.html """ } diff --git a/modules/local/trim_galore.nf b/modules/local/trim_galore.nf index fa3e69e8c1..d0bf00ff88 100644 --- a/modules/local/trim_galore.nf +++ b/modules/local/trim_galore.nf @@ -5,19 +5,18 @@ process TRIM_GALORE { publishDir "${params.outdir}/Reports/${idSample}/TrimGalore/${idSample}_${idRun}", mode: params.publish_dir_mode, saveAs: {filename -> - if (filename.indexOf("_fastqc") > 0) "FastQC/$filename" - else if (filename.indexOf("trimming_report.txt") > 0) "logs/$filename" + if (filename.indexOf("_fastqc") > 0) "FastQC/${filename}" + else if (filename.indexOf("trimming_report.txt") > 0) "logs/${filename}" else if 
(params.save_trimmed) filename else null } input: - tuple val(idPatient), val(idSample), val(idRun), file("${idSample}_${idRun}_R1.fastq.gz"), file("${idSample}_${idRun}_R2.fastq.gz") + tuple val(idPatient), val(idSample), val(idRun), path("${idSample}_${idRun}_R1.fastq.gz"), path("${idSample}_${idRun}_R2.fastq.gz") output: - path "*.{html,zip,txt}", emit: report - tuple idPatient, idSample, idRun, file("${idSample}_${idRun}_R1_val_1.fq.gz"), file("${idSample}_${idRun}_R2_val_2.fq.gz") , emit: trimmed_reads - + path "*.{html,zip,txt}", emit: report + tuple idPatient, idSample, idRun, path("${idSample}_${idRun}_R1_val_1.fq.gz"), path("${idSample}_${idRun}_R2_val_2.fq.gz"), emit: trimmed_reads script: // Calculate number of --cores for TrimGalore based on value of task.cpus diff --git a/modules/nf-core/bwamem2_index.nf b/modules/nf-core/bwamem2_index.nf index 0206509753..661b655b7a 100644 --- a/modules/nf-core/bwamem2_index.nf +++ b/modules/nf-core/bwamem2_index.nf @@ -8,7 +8,7 @@ process BWAMEM2_INDEX { path fasta output: - path("${fasta}.*") + path "${fasta}.*" script: """ diff --git a/modules/nf-core/fastqc.nf b/modules/nf-core/fastqc.nf index b7e58261e1..c5408d179e 100644 --- a/modules/nf-core/fastqc.nf +++ b/modules/nf-core/fastqc.nf @@ -1,5 +1,5 @@ process FASTQC { - label 'FastQC' + label 'FASTQC' label 'cpus_2' tag "${idPatient}-${idRun}" @@ -7,12 +7,11 @@ process FASTQC { publishDir "${params.outdir}/Reports/${idSample}/FastQC/${idSample}_${idRun}", mode: params.publish_dir_mode input: - tuple val(idPatient), val(idSample), val(idRun), file("${idSample}_${idRun}_R1.fastq.gz"), file("${idSample}_${idRun}_R2.fastq.gz") - + tuple val(idPatient), val(idSample), val(idRun), path("${idSample}_${idRun}_R1.fastq.gz"), path("${idSample}_${idRun}_R2.fastq.gz") + output: path "*.{html,zip}" - script: """ fastqc -t 2 -q ${idSample}_${idRun}_R1.fastq.gz ${idSample}_${idRun}_R2.fastq.gz diff --git a/modules/nf-core/htslib_tabix.nf b/modules/nf-core/htslib_tabix.nf index 8786ce63f1..fef5ab0cc9 100644 --- a/modules/nf-core/htslib_tabix.nf +++ b/modules/nf-core/htslib_tabix.nf @@ -4,10 +4,10 @@ process HTSLIB_TABIX { container 'quay.io/biocontainers/tabix:0.2.6--ha92aebf_0' input: - path(vcf) + path vcf output: - path("${vcf}.tbi") + path "${vcf}.tbi" script: """ diff --git a/modules/nf-core/multiqc.nf b/modules/nf-core/multiqc.nf index 5daf608196..a97e8daa17 100644 --- a/modules/nf-core/multiqc.nf +++ b/modules/nf-core/multiqc.nf @@ -9,25 +9,24 @@ process MULTIQC { publishDir "${params.outdir}/multiqc", mode: params.publish_dir_mode input: - path multiqc_config - path mqc_custom_config - // TODO nf-core: Add in log files from your new processes for MultiQC to find! - path fastqc - path trim_galore - path software_versions - val workflow_summary + path fastqc + path multiqc_config + path multiqc_custom_config + path software_versions + path trim_galore + val workflow_summary output: - path "*multiqc_report.html" - path "*_data" - path "multiqc_plots" + path "*multiqc_report.html" + path "*_data" + path "multiqc_plots" script: - rtitle = custom_runName ? "--title \"$custom_runName\"" : '' - rfilename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : '' - custom_config_file = params.multiqc_config ? "--config $mqc_custom_config" : '' + title = custom_runName ? "--title \"${custom_runName}\"" : '' + filename = custom_runName ? 
"--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : '' + custom_config_file = params.multiqc_config ? "--config ${multiqc_custom_config}" : '' """ - echo '$workflow_summary' > workflow_summary_mqc.yaml - multiqc -f $rtitle $rfilename $custom_config_file . + echo '${workflow_summary}' > workflow_summary_mqc.yaml + multiqc -f ${title} ${filename} ${custom_config_file} . """ } diff --git a/modules/nf-core/samtools_faidx.nf b/modules/nf-core/samtools_faidx.nf index fcb2e5cb61..b3eb6fa86b 100644 --- a/modules/nf-core/samtools_faidx.nf +++ b/modules/nf-core/samtools_faidx.nf @@ -8,7 +8,7 @@ process SAMTOOLS_FAIDX { path fasta output: - path ("${fasta}.fai") + path "${fasta}.fai" //when: !(params.fasta_fai) && params.fasta && !('annotate' in step) diff --git a/modules/subworkflows/build_indices.nf b/modules/subworkflows/build_indices.nf index 6af688b36a..ae3e4967d2 100644 --- a/modules/subworkflows/build_indices.nf +++ b/modules/subworkflows/build_indices.nf @@ -6,16 +6,16 @@ // And then initialize channels based on params or indexes that were just built - -include { HTSLIB_TABIX as HTSLIB_TABIX_DBSNP } from '../nf-core/htslib_tabix' -include { HTSLIB_TABIX as HTSLIB_TABIX_GERMLINE_RESOURCE } from '../nf-core/htslib_tabix' -include { HTSLIB_TABIX as HTSLIB_TABIX_KNOWN_INDELS } from '../nf-core/htslib_tabix' -include { HTSLIB_TABIX as HTSLIB_TABIX_PON } from '../nf-core/htslib_tabix' +include { BUILD_INTERVALS } from '../local/build_intervals.nf' include { BWAMEM2_INDEX as BWAMEM2_INDEX } from '../nf-core/bwamem2_index.nf' -include { SAMTOOLS_FAIDX as SAMTOOLS_FAIDX } from '../nf-core/samtools_faidx.nf' include { GATK_CREATE_SEQUENCE_DICTIONARY as GATK_CREATE_SEQUENCE_DICTIONARY } from '../local/gatk_dict.nf' -include { BUILD_INTERVALS } from '../local/build_intervals.nf' - +include { + HTSLIB_TABIX as HTSLIB_TABIX_DBSNP; + HTSLIB_TABIX as HTSLIB_TABIX_GERMLINE_RESOURCE; + HTSLIB_TABIX as HTSLIB_TABIX_KNOWN_INDELS; + HTSLIB_TABIX as HTSLIB_TABIX_PON; +} from '../nf-core/htslib_tabix' +include { SAMTOOLS_FAIDX as SAMTOOLS_FAIDX } from '../nf-core/samtools_faidx.nf' workflow BUILD_INDICES{ take: @@ -49,19 +49,17 @@ workflow BUILD_INDICES{ if (!(params.pon_index) && params.pon && ('tnscope' in tools || 'mutect2' in tools)) HTSLIB_TABIX_PON(ch_pon) - if (!(params.intervals) && !('annotate' in step) && !('controlfreec' in step)){ - ch_fai = params.fasta_fai ? 
Channel.value(file(params.fasta_fai)) : SAMTOOLS_FAIDX.out - ch_fai.dump(tag: 'ch_fai') - BUILD_INTERVALS(ch_fai) + if (!(params.intervals) && !('annotate' in step) && !('controlfreec' in step)) { + BUILD_INTERVALS(SAMTOOLS_FAIDX.out) } emit: bwa_built = BWAMEM2_INDEX.out + dbsnp_tbi = HTSLIB_TABIX_DBSNP.out dictBuilt = GATK_CREATE_SEQUENCE_DICTIONARY.out fai_built = SAMTOOLS_FAIDX.out - dbsnp_tbi = HTSLIB_TABIX_DBSNP.out germline_resource_tbi = HTSLIB_TABIX_GERMLINE_RESOURCE.out + intervalBuilt = BUILD_INTERVALS.out known_indels_tbi = HTSLIB_TABIX_KNOWN_INDELS.out pon_tbi = HTSLIB_TABIX_PON.out - intervalBuilt = BUILD_INTERVALS.out } From 510a585681578c6ed520d60eaa2c134cb50a4221 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 16 Jul 2020 00:10:55 +0200 Subject: [PATCH 027/200] add BWAMEM2_MEM module --- main.nf | 687 ++++++++++++++++++++++++-------------------------------- 1 file changed, 290 insertions(+), 397 deletions(-) diff --git a/main.nf b/main.nf index 0c6759fd90..3705e1f3a4 100644 --- a/main.nf +++ b/main.nf @@ -36,15 +36,17 @@ if (params.help) { INCLUDE SAREK FUNCTIONS ================================================================================ */ -include { hasExtension; - defineStepList; - extractFastq; - extractInfos; - defineToolList; - checkParameterList; - extractBam; - extractFastqFromDir; - checkParameterExistence } from './modules/local/functions' +include { + check_parameter_existence; + check_parameter_list; + define_step_list; + define_tool_list; + extract_bam; + extract_fastq; + extract_fastq_from_dir; + extract_infos; + has_extension +} from './modules/local/functions' /* ================================================================================ @@ -55,6 +57,7 @@ include { hasExtension; /* * Check parameters */ + Checks.aws_batch(workflow, params) // Check AWS batch settings Checks.hostname(workflow, params, log) // Check the hostnames against configured profiles @@ -62,10 +65,11 @@ Checks.hostname(workflow, params, log) // Check the hostnames against configured * MultiQC * Stage config files */ -ch_multiqc_config = file("$baseDir/assets/multiqc_config.yaml", checkIfExists: true) -ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty() -ch_output_docs = file("$baseDir/docs/output.md", checkIfExists: true) -ch_output_docs_images = file("$baseDir/docs/images/", checkIfExists: true) + +multiqc_config = file("$baseDir/assets/multiqc_config.yaml", checkIfExists: true) +multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty() +output_docs = file("$baseDir/docs/output.md", checkIfExists: true) +output_docs_images = file("$baseDir/docs/images/", checkIfExists: true) // // Check if genome exists in the config file // if (params.genomes && !params.genomes.containsKey(params.genome) && !params.igenomes_ignore) { @@ -74,27 +78,27 @@ ch_output_docs_images = file("$baseDir/docs/images/", checkIfExists: true) // exit 1, "The provided genome '${params.genome}' is not available in the genomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}" // } -stepList = defineStepList() +step_list = define_step_list() step = params.step ? 
params.step.toLowerCase().replaceAll('-', '').replaceAll('_', '') : '' // Handle deprecation if (step == 'preprocessing') step = 'mapping' if (step.contains(',')) exit 1, 'You can choose only one step, see --help for more information' -if (!checkParameterExistence(step, stepList)) exit 1, "Unknown step ${step}, see --help for more information" +if (!check_parameter_existence(step, step_list)) exit 1, "Unknown step ${step}, see --help for more information" -toolList = defineToolList() +tool_list = define_tool_list() tools = params.tools ? params.tools.split(',').collect{it.trim().toLowerCase().replaceAll('-', '').replaceAll('_', '')} : [] if (step == 'controlfreec') tools = ['controlfreec'] -if (!checkParameterList(tools, toolList)) exit 1, 'Unknown tool(s), see --help for more information' +if (!check_parameter_list(tools, tool_list)) exit 1, 'Unknown tool(s), see --help for more information' -// skipQClist = defineSkipQClist() -// skipQC = params.skip_qc ? params.skip_qc == 'all' ? skipQClist : params.skip_qc.split(',').collect{it.trim().toLowerCase().replaceAll('-', '').replaceAll('_', '')} : [] -// if (!checkParameterList(skipQC, skipQClist)) exit 1, 'Unknown QC tool(s), see --help for more information' +// skip__qc_list = define_skip_qc_list() +// skipQC = params.skip_qc ? params.skip_qc == 'all' ? skip__qc_list : params.skip_qc.split(',').collect{it.trim().toLowerCase().replaceAll('-', '').replaceAll('_', '')} : [] +// if (!check_parameter_list(skipQC, skip__qc_list)) exit 1, 'Unknown QC tool(s), see --help for more information' -// annoList = defineAnnoList() -// annotateTools = params.annotate_tools ? params.annotate_tools.split(',').collect{it.trim().toLowerCase().replaceAll('-', '')} : [] -// if (!checkParameterList(annotateTools,annoList)) exit 1, 'Unknown tool(s) to annotate, see --help for more information' +// anno_list = define_anno_list() +// annotate_tools = params.annotate_tools ? params.annotate_tools.split(',').collect{it.trim().toLowerCase().replaceAll('-', '')} : [] +// if (!check_parameter_list(annotate_tools,anno_list)) exit 1, 'Unknown tool(s) to annotate, see --help for more information' // // Check parameters // if ((params.ascat_ploidy && !params.ascat_purity) || (!params.ascat_ploidy && params.ascat_purity)) exit 1, 'Please specify both --ascat_purity and --ascat_ploidy, or none of them' @@ -102,9 +106,9 @@ if (!checkParameterList(tools, toolList)) exit 1, 'Unknown tool(s), see --help f // Handle input -tsvPath = null -if (params.input && (hasExtension(params.input, "tsv") || hasExtension(params.input, "vcf") || hasExtension(params.input, "vcf.gz"))) tsvPath = params.input -if (params.input && (hasExtension(params.input, "vcf") || hasExtension(params.input, "vcf.gz"))) step = "annotate" +tsv_path = null +if (params.input && (has_extension(params.input, "tsv") || has_extension(params.input, "vcf") || has_extension(params.input, "vcf.gz"))) tsv_path = params.input +if (params.input && (has_extension(params.input, "vcf") || has_extension(params.input, "vcf.gz"))) step = "annotate" // save_bam_mapped = params.skip_markduplicates ? true : params.save_bam_mapped ? 
true : false @@ -113,65 +117,65 @@ if (params.input && (hasExtension(params.input, "vcf") || hasExtension(params.in if (!params.input && params.sentieon) { switch (step) { case 'mapping': break - case 'recalibrate': tsvPath = "${params.outdir}/Preprocessing/TSV/sentieon_deduped.tsv"; break - case 'variantcalling': tsvPath = "${params.outdir}/Preprocessing/TSV/sentieon_recalibrated.tsv"; break + case 'recalibrate': tsv_path = "${params.outdir}/Preprocessing/TSV/sentieon_deduped.tsv"; break + case 'variantcalling': tsv_path = "${params.outdir}/Preprocessing/TSV/sentieon_recalibrated.tsv"; break case 'annotate': break default: exit 1, "Unknown step ${step}" } } else if (!params.input && !params.sentieon && !params.skip_markduplicates) { switch (step) { case 'mapping': break - case 'preparerecalibration': tsvPath = "${params.outdir}/Preprocessing/TSV/duplicates_marked_no_table.tsv"; break - case 'recalibrate': tsvPath = "${params.outdir}/Preprocessing/TSV/duplicates_marked.tsv"; break - case 'variantcalling': tsvPath = "${params.outdir}/Preprocessing/TSV/recalibrated.tsv"; break - case 'controlfreec': tsvPath = "${params.outdir}/VariantCalling/TSV/control-freec_mpileup.tsv"; break + case 'preparerecalibration': tsv_path = "${params.outdir}/Preprocessing/TSV/duplicates_marked_no_table.tsv"; break + case 'recalibrate': tsv_path = "${params.outdir}/Preprocessing/TSV/duplicates_marked.tsv"; break + case 'variantcalling': tsv_path = "${params.outdir}/Preprocessing/TSV/recalibrated.tsv"; break + case 'controlfreec': tsv_path = "${params.outdir}/VariantCalling/TSV/control-freec_mpileup.tsv"; break case 'annotate': break default: exit 1, "Unknown step ${step}" } } else if (!params.input && !params.sentieon && params.skip_markduplicates) { switch (step) { case 'mapping': break - case 'preparerecalibration': tsvPath = "${params.outdir}/Preprocessing/TSV/mapped.tsv"; break - case 'recalibrate': tsvPath = "${params.outdir}/Preprocessing/TSV/mapped_no_duplicates_marked.tsv"; break - case 'variantcalling': tsvPath = "${params.outdir}/Preprocessing/TSV/recalibrated.tsv"; break - case 'controlfreec': tsvPath = "${params.outdir}/VariantCalling/TSV/control-freec_mpileup.tsv"; break + case 'preparerecalibration': tsv_path = "${params.outdir}/Preprocessing/TSV/mapped.tsv"; break + case 'recalibrate': tsv_path = "${params.outdir}/Preprocessing/TSV/mapped_no_duplicates_marked.tsv"; break + case 'variantcalling': tsv_path = "${params.outdir}/Preprocessing/TSV/recalibrated.tsv"; break + case 'controlfreec': tsv_path = "${params.outdir}/VariantCalling/TSV/control-freec_mpileup.tsv"; break case 'annotate': break default: exit 1, "Unknown step ${step}" } } -inputSample = Channel.empty() -if (tsvPath) { - tsvFile = file(tsvPath) +input_sample = Channel.empty() +if (tsv_path) { + tsv_file = file(tsv_path) switch (step) { - case 'mapping': inputSample = extractFastq(tsvFile); break - case 'preparerecalibration': inputSample = extractBam(tsvFile); break - case 'recalibrate': inputSample = extractRecal(tsvFile); break - case 'variantcalling': inputSample = extractBam(tsvFile); break - case 'controlfreec': inputSample = extractPileup(tsvFile); break + case 'mapping': input_sample = extract_fastq(tsv_file); break + case 'preparerecalibration': input_sample = extract_bam(tsv_file); break + case 'recalibrate': input_sample = extract_recal(tsv_file); break + case 'variantcalling': input_sample = extract_bam(tsv_file); break + case 'controlfreec': input_sample = extract_pileup(tsv_file); break case 'annotate': break default: exit 
1, "Unknown step ${step}" } -} else if (params.input && !hasExtension(params.input, "tsv")) { +} else if (params.input && !has_extension(params.input, "tsv")) { log.info "No TSV file" if (step != 'mapping') exit 1, 'No step other than "mapping" supports a directory as an input' log.info "Reading ${params.input} directory" log.warn "[nf-core/sarek] in ${params.input} directory, all fastqs are assuming to be from the same sample, which is assumed to be a germline one" - inputSample = extractFastqFromDir(params.input) - (inputSample, fastqTMP) = inputSample.into(2) - fastqTMP.toList().subscribe onNext: { + input_sample = extract_fastq_from_dir(params.input) + (input_sample, fastq_tmp) = input_sample.into(2) + fastq_tmp.toList().subscribe onNext: { if (it.size() == 0) exit 1, "No FASTQ files found in --input directory '${params.input}'" } - tsvFile = params.input // used in the reports -} else if (tsvPath && step == 'annotate') { - log.info "Annotating ${tsvPath}" + tsv_file = params.input // used in the reports +} else if (tsv_path && step == 'annotate') { + log.info "Annotating ${tsv_path}" } else if (step == 'annotate') { log.info "Trying automatic annotation on files in the VariantCalling/ directory" } else exit 1, 'No sample were defined, see --help' -(genderMap, statusMap, inputSample) = extractInfos(inputSample) +(gender_map, status_map, input_sample) = extract_infos(input_sample) -// inputSample.dump(tag: 'input sample') +// input_sample.dump(tag: 'input sample') /* ================================================================================ @@ -183,7 +187,6 @@ if (tsvPath) { // params.fasta has to be the first one params.fasta = params.genome && !('annotate' in step) ? params.genomes[params.genome].fasta ?: null : null - // The rest can be sorted params.ac_loci = params.genome && 'ascat' in tools ? params.genomes[params.genome].ac_loci ?: null : null params.ac_loci_gc = params.genome && 'ascat' in tools ? params.genomes[params.genome].ac_loci_gc ?: null : null @@ -205,18 +208,18 @@ params.species = params.genome && 'vep' in tools ? params.genomes[params.genome] params.vep_cache_version = params.genome && 'vep' in tools ? params.genomes[params.genome].vep_cache_version ?: null : null // Initialize channels based on params -ch_fasta = params.fasta && !('annotate' in step) ? Channel.value(file(params.fasta)) : "null" -ch_dbsnp = params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || params.sentieon) ? Channel.value(file(params.dbsnp)) : "null" -ch_germline_resource = params.germline_resource && 'mutect2' in tools ? Channel.value(file(params.germline_resource)) : "null" -ch_known_indels = params.known_indels && ('mapping' in step || 'preparerecalibration' in step) ? Channel.value(file(params.known_indels)) : "null" -ch_pon = params.pon ? Channel.value(file(params.pon)) : "null" +fasta = params.fasta && !('annotate' in step) ? Channel.value(file(params.fasta)) : "null" +dbsnp = params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || params.sentieon) ? Channel.value(file(params.dbsnp)) : "null" +germline_resource = params.germline_resource && 'mutect2' in tools ? Channel.value(file(params.germline_resource)) : "null" +known_indels = params.known_indels && ('mapping' in step || 'preparerecalibration' in step) ? Channel.value(file(params.known_indels)) : "null" +pon = params.pon ? 
Channel.value(file(params.pon)) : "null" ch_ac_loci = params.ac_loci && 'ascat' in tools ? Channel.value(file(params.ac_loci)) : "null" ch_ac_loci_gc = params.ac_loci_gc && 'ascat' in tools ? Channel.value(file(params.ac_loci_gc)) : "null" ch_chr_dir = params.chr_dir && 'controlfreec' in tools ? Channel.value(file(params.chr_dir)) : "null" ch_chr_length = params.chr_length && 'controlfreec' in tools ? Channel.value(file(params.chr_length)) : "null" -ch_fai = params.fasta_fai && !('annotate' in step) ? Channel.value(file(params.fasta_fai)) : "null" -ch_intervals = params.intervals && !params.no_intervals && !('annotate' in step) ? Channel.value(file(params.intervals)) : "null" +fai = params.fasta_fai && !('annotate' in step) ? Channel.value(file(params.fasta_fai)) : "null" +intervals = params.intervals && !params.no_intervals && !('annotate' in step) ? Channel.value(file(params.intervals)) : "null" ch_mappability = params.mappability && 'controlfreec' in tools ? Channel.value(file(params.mappability)) : "null" ch_snpeff_cache = params.snpeff_cache ? Channel.value(file(params.snpeff_cache)) : "null" @@ -250,7 +253,7 @@ log.info "-\033[2m----------------------------------------------------\033[0m-" // params summary for MultiQC workflow_summary = Schema.params_mqc_summary(summary) -ch_workflow_summary = Channel.value(workflow_summary) +workflow_summary = Channel.value(workflow_summary) if ('mutect2' in tools && !(params.pon)) log.warn "[nf-core/sarek] Mutect2 was requested, but as no panel of normals were given, results will not be optimal" if (params.sentieon) log.warn "[nf-core/sarek] Sentieon will be used, only works if Sentieon is available where nf-core/sarek is run" @@ -260,37 +263,42 @@ if (params.sentieon) log.warn "[nf-core/sarek] Sentieon will be used, only works INCLUDE LOCAL PIPELINE MODULES ================================================================================ */ -include { OUTPUT_DOCUMENTATION } from './modules/local/output_documentation' params(params) + +include { BWAMEM2_MEM } from './modules/local/bwamem2_mem.nf' addParams(params) +include { CREATE_INTERVALS_BED } from './modules/local/create_intervals_bed' addParams(params) include { GET_SOFTWARE_VERSIONS } from './modules/local/get_software_versions' params(params) +include { OUTPUT_DOCUMENTATION } from './modules/local/output_documentation' params(params) +include { TRIM_GALORE } from './modules/local/trim_galore.nf' addParams(params) /* ================================================================================ INCLUDE nf-core PIPELINE MODULES ================================================================================ */ -include { FASTQC } from './modules/nf-core/fastqc' params(params) + +include { FASTQC } from './modules/nf-core/fastqc' params(params) include { MULTIQC } from './modules/nf-core/multiqc' params(params) // PREPARING CHANNELS FOR PREPROCESSING AND QC -// inputBam = Channel.empty() -// inputPairReads = Channel.empty() +// input_bam = Channel.empty() +// input_pair_reads = Channel.empty() // if (step in ['preparerecalibration', 'recalibrate', 'variantcalling', 'controlfreec', 'annotate']) { -// inputBam.close() -// inputPairReads.close() -// } else inputSample.branch(inputPairReads, inputBam) {hasExtension(it[3], "bam") ? 1 : 0} +// input_bam.close() +// input_pair_reads.close() +// } else input_sample.branch(input_pair_reads, input_bam) {has_extension(it[3], "bam") ? 
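// NB: every reference channel above repeats the same guard: wrap the file in a
// value channel when the params entry is set and the current step/tools need it,
// otherwise fall back to the literal string "null". A hedged sketch of a helper
// that could factor this out (`opt_ref` is hypothetical, not pipeline code):
def opt_ref(path, needed = true) {
    // value channel if provided and needed, "null" placeholder otherwise
    path && needed ? Channel.value(file(path)) : "null"
}
// e.g. the initialisations above would become:
// fasta = opt_ref(params.fasta, !('annotate' in step))
// pon   = opt_ref(params.pon)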
1 : 0} -// (inputBam, inputBamFastQC) = inputBam.into(2) +// (input_bam, input_bam_fastqc) = input_bam.into(2) // // Removing inputFile2 which is null in case of uBAM -// inputBamFastQC = inputBamFastQC.map { +// input_bam_fastqc = input_bam_fastqc.map { // idPatient, idSample, idRun, inputFile1, inputFile2 -> // [idPatient, idSample, idRun, inputFile1] // } // if (params.split_fastq){ -// inputPairReads = inputPairReads +// input_pair_reads = input_pair_reads // // newly splitfastq are named based on split, so the name is easier to catch // .splitFastq(by: params.split_fastq, compress:true, file:"split", pe:true) // .map {idPatient, idSample, idRun, reads1, reads2 -> @@ -307,9 +315,9 @@ include { MULTIQC } from './modules/nf-core/multiqc' params(params) // [idPatient, idSample, newIdRun, reads1, reads2]} //} -// inputPairReads.dump(tag:'INPUT') +// input_pair_reads.dump(tag:'INPUT') -// (inputPairReads, inputPairReadsTrimGalore, inputPairReadsFastQC) = inputPairReads.into(3) +// (input_pair_reads, input_pair_readstrimgalore, input_pair_readsfastqc) = input_pair_reads.into(3) /* @@ -317,38 +325,34 @@ include { MULTIQC } from './modules/nf-core/multiqc' params(params) RUN THE WORKFLOW ================================================================================ */ -include { BUILD_INDICES } from './modules/subworkflows/build_indices' addParams(params) -include { CREATE_INTERVALS_BED } from './modules/local/create_intervals_bed' addParams(params) -include { TRIM_GALORE } from './modules/local/trim_galore.nf' addParams(params) -ch_fasta.dump(tag: 'ch_fasta') +include { BUILD_INDICES } from './modules/subworkflows/build_indices' addParams(params) +fasta.dump(tag: 'fasta') workflow { - // BUILD INDICES - BUILD_INDICES( - ch_dbsnp, - ch_fasta, - ch_germline_resource, - ch_known_indels, - ch_pon, + dbsnp, + fasta, + germline_resource, + known_indels, + pon, step) - ch_bwa = params.bwa ? Channel.value(file(params.bwa)) : BUILD_INDICES.out.bwa_built - ch_dict = params.dict ? Channel.value(file(params.dict)) : BUILD_INDICES.out.dictBuilt - ch_fai = params.fasta_fai ? Channel.value(file(params.fasta_fai)) : BUILD_INDICES.out.fai_built - ch_dbsnp_tbi = params.dbsnp ? params.dbsnp_index ? Channel.value(file(params.dbsnp_index)) : BUILD_INDICES.out.dbsnp_tbi : "null" - ch_germline_resource_tbi = params.germline_resource ? params.germline_resource_index ? Channel.value(file(params.germline_resource_index)) : BUILD_INDICES.out.germline_resource_tbi : "null" - ch_known_indels_tbi = params.known_indels ? params.known_indels_index ? Channel.value(file(params.known_indels_index)) : BUILD_INDICES.out.known_indels_tbi.collect() : "null" - ch_pon_tbi = params.pon ? params.pon_index ? Channel.value(file(params.pon_index)) : BUILD_INDICES.out.pon_tbi : "null" - ch_intervals = params.no_intervals ? "null" : params.intervals && !('annotate' in step) ? Channel.value(file(params.intervals)) : BUILD_INDICES.out.intervalBuilt - ch_intervals.dump(tag: 'ch_intervals') + bwa = params.bwa ? Channel.value(file(params.bwa)) : BUILD_INDICES.out.bwa_built + dict = params.dict ? Channel.value(file(params.dict)) : BUILD_INDICES.out.dictBuilt + fai = params.fasta_fai ? Channel.value(file(params.fasta_fai)) : BUILD_INDICES.out.fai_built + dbsnp_tbi = params.dbsnp ? params.dbsnp_index ? Channel.value(file(params.dbsnp_index)) : BUILD_INDICES.out.dbsnp_tbi : "null" + germline_resource_tbi = params.germline_resource ? params.germline_resource_index ? 
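// NB: the index fallbacks above chain two ternaries without parentheses. Since ?:
// is right-associative in Groovy they parse as shown below for the dbsnp case,
// written out with explicit grouping purely for clarity:
// dbsnp_tbi = params.dbsnp
//     ? (params.dbsnp_index ? Channel.value(file(params.dbsnp_index)) // user-supplied index
//                           : BUILD_INDICES.out.dbsnp_tbi)            // index built on the fly
//     : "null"                                                        // no dbsnp at all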
Channel.value(file(params.germline_resource_index)) : BUILD_INDICES.out.germline_resource_tbi : "null" + known_indels_tbi = params.known_indels ? params.known_indels_index ? Channel.value(file(params.known_indels_index)) : BUILD_INDICES.out.known_indels_tbi.collect() : "null" + pon_tbi = params.pon ? params.pon_index ? Channel.value(file(params.pon_index)) : BUILD_INDICES.out.pon_tbi : "null" + intervals = params.no_intervals ? "null" : params.intervals && !('annotate' in step) ? Channel.value(file(params.intervals)) : BUILD_INDICES.out.intervalBuilt + intervals.dump(tag: 'intervals') // PREPROCESSING if((!params.no_intervals) && step != 'annotate') - CREATE_INTERVALS_BED(ch_intervals) + CREATE_INTERVALS_BED(intervals) // BED INTERVAL CHANNEL TRANSFORMING ch_bed_intervals = CREATE_INTERVALS_BED.out @@ -376,27 +380,31 @@ workflow { } //if(!('fastqc' in skipQC)) - FASTQC(inputSample) - - if(params.trim_fastq) - TRIM_GALORE(inputSample) + FASTQC(input_sample) + + if(params.trim_fastq) { + TRIM_GALORE(input_sample) + BWAMEM2_MEM(TRIM_GALORE.out.trimmed_reads, bwa, fasta, fai) + } + else { + BWAMEM2_MEM(input_sample, bwa, fasta, fai) + } OUTPUT_DOCUMENTATION( - ch_output_docs, - ch_output_docs_images) + output_docs, + output_docs_images) GET_SOFTWARE_VERSIONS() MULTIQC( - ch_multiqc_config, - ch_multiqc_custom_config.collect().ifEmpty([]), - FASTQC.out.collect().ifEmpty([]), - TRIM_GALORE.out.report.collect().ifEmpty([]), - GET_SOFTWARE_VERSIONS.out.yml.collect(), - ch_workflow_summary) + FASTQC.out.ifEmpty([]), + multiqc_config, + multiqc_custom_config.ifEmpty([]), + GET_SOFTWARE_VERSIONS.out.yml, + TRIM_GALORE.out.report.ifEmpty([]), + workflow_summary) } - /* ================================================================================ SEND COMPLETION EMAIL @@ -408,8 +416,6 @@ workflow.onComplete { Completion.summary(workflow, params, log) } - - // /* // ================================================================================ // PREPROCESSING @@ -434,7 +440,7 @@ workflow.onComplete { // publishDir "${params.outdir}/Reports/${idSample}/FastQC/${idSample}_${idRun}", mode: params.publish_dir_mode // input: -// set idPatient, idSample, idRun, file("${idSample}_${idRun}.bam") from inputBamFastQC +// set idPatient, idSample, idRun, file("${idSample}_${idRun}.bam") from input_bam_fastqc // output: // file("*.{html,zip}") into fastQCBAMReport @@ -449,112 +455,19 @@ workflow.onComplete { // fastQCReport = fastQCFQReport.mix(fastQCBAMReport) -// process TrimGalore { -// label 'TrimGalore' - -// tag "${idPatient}-${idRun}" - -// publishDir "${params.outdir}/Reports/${idSample}/TrimGalore/${idSample}_${idRun}", mode: params.publish_dir_mode, -// saveAs: {filename -> -// if (filename.indexOf("_fastqc") > 0) "FastQC/$filename" -// else if (filename.indexOf("trimming_report.txt") > 0) "logs/$filename" -// else if (params.save_trimmed) filename -// else null -// } - -// input: -// set idPatient, idSample, idRun, file("${idSample}_${idRun}_R1.fastq.gz"), file("${idSample}_${idRun}_R2.fastq.gz") from inputPairReadsTrimGalore - -// output: -// file("*.{html,zip,txt}") into trimGaloreReport -// set idPatient, idSample, idRun, file("${idSample}_${idRun}_R1_val_1.fq.gz"), file("${idSample}_${idRun}_R2_val_2.fq.gz") into outputPairReadsTrimGalore - -// when: params.trim_fastq - -// script: -// // Calculate number of --cores for TrimGalore based on value of task.cpus -// // See: https://github.com/FelixKrueger/TrimGalore/blob/master/Changelog.md#version-060-release-on-1-mar-2019 -// // See: 
https://github.com/nf-core/atacseq/pull/65 -// def cores = 1 -// if (task.cpus) { -// cores = (task.cpus as int) - 4 -// if (cores < 1) cores = 1 -// if (cores > 4) cores = 4 -// } -// c_r1 = params.clip_r1 > 0 ? "--clip_r1 ${params.clip_r1}" : '' -// c_r2 = params.clip_r2 > 0 ? "--clip_r2 ${params.clip_r2}" : '' -// tpc_r1 = params.three_prime_clip_r1 > 0 ? "--three_prime_clip_r1 ${params.three_prime_clip_r1}" : '' -// tpc_r2 = params.three_prime_clip_r2 > 0 ? "--three_prime_clip_r2 ${params.three_prime_clip_r2}" : '' -// nextseq = params.trim_nextseq > 0 ? "--nextseq ${params.trim_nextseq}" : '' -// """ -// trim_galore \ -// --cores ${cores} \ -// --paired \ -// --fastqc \ -// --gzip \ -// ${c_r1} ${c_r2} \ -// ${tpc_r1} ${tpc_r2} \ -// ${nextseq} \ -// ${idSample}_${idRun}_R1.fastq.gz ${idSample}_${idRun}_R2.fastq.gz - -// mv *val_1_fastqc.html "${idSample}_${idRun}_R1.trimmed_fastqc.html" -// mv *val_2_fastqc.html "${idSample}_${idRun}_R2.trimmed_fastqc.html" -// mv *val_1_fastqc.zip "${idSample}_${idRun}_R1.trimmed_fastqc.zip" -// mv *val_2_fastqc.zip "${idSample}_${idRun}_R2.trimmed_fastqc.zip" -// """ -// } - -// if (!params.trim_fastq) inputPairReadsTrimGalore.close() +// if (!params.trim_fastq) input_pair_readstrimgalore.close() // // STEP 1: MAPPING READS TO REFERENCE GENOME WITH BWA MEM -// if (params.trim_fastq) inputPairReads = outputPairReadsTrimGalore -// else inputPairReads = inputPairReads.mix(inputBam) +// if (params.trim_fastq) input_pair_reads = outputPairReadsTrimGalore +// else input_pair_reads = input_pair_reads.mix(input_bam) -// inputPairReads = inputPairReads.dump(tag:'INPUT') +// input_pair_reads = input_pair_reads.dump(tag:'INPUT') -// (inputPairReads, input_pair_reads_sentieon) = inputPairReads.into(2) -// if (params.sentieon) inputPairReads.close() +// (input_pair_reads, input_pair_reads_sentieon) = input_pair_reads.into(2) +// if (params.sentieon) input_pair_reads.close() // else input_pair_reads_sentieon.close() -// process MapReads { -// label 'cpus_max' - -// tag "${idPatient}-${idRun}" - -// input: -// set idPatient, idSample, idRun, file(inputFile1), file(inputFile2) from inputPairReads -// file(bwaIndex) from ch_bwa -// file(fasta) from ch_fasta -// file(fastaFai) from ch_fai - -// output: -// set idPatient, idSample, idRun, file("${idSample}_${idRun}.bam") into bamMapped -// set idPatient, val("${idSample}_${idRun}"), file("${idSample}_${idRun}.bam") into bamMappedBamQC - -// when: !(params.sentieon) - -// script: -// // -K is an hidden option, used to fix the number of reads processed by bwa mem -// // Chunk size can affect bwa results, if not specified, -// // the number of threads can change which can give not deterministic result. -// // cf https://github.com/CCDG/Pipeline-Standardization/blob/master/PipelineStandard.md -// // and https://github.com/gatk-workflows/gatk4-data-processing/blob/8ffa26ff4580df4ac3a5aa9e272a4ff6bab44ba2/processing-for-variant-discovery-gatk4.b37.wgs.inputs.json#L29 -// CN = params.sequencing_center ? "CN:${params.sequencing_center}\\t" : "" -// readGroup = "@RG\\tID:${idRun}\\t${CN}PU:${idRun}\\tSM:${idSample}\\tLB:${idSample}\\tPL:illumina" -// // adjust mismatch penalty for tumor samples -// status = statusMap[idPatient, idSample] -// extra = status == 1 ? "-B 3" : "" -// convertToFastq = hasExtension(inputFile1, "bam") ? "gatk --java-options -Xmx${task.memory.toGiga()}g SamToFastq --INPUT=${inputFile1} --FASTQ=/dev/stdout --INTERLEAVE=true --NON_PF=true | \\" : "" -// input = hasExtension(inputFile1, "bam") ? 
"-p /dev/stdin - 2> >(tee ${inputFile1}.bwa.stderr.log >&2)" : "${inputFile1} ${inputFile2}" -// """ -// ${convertToFastq} -// bwa mem -K 100000000 -R \"${readGroup}\" ${extra} -t ${task.cpus} -M ${fasta} \ -// ${input} | \ -// samtools sort --threads ${task.cpus} -m 2G - > ${idSample}_${idRun}.bam -// """ -// } - // bamMapped = bamMapped.dump(tag:'Mapped BAM') // // Sort BAM whether they are standalone or should be merged @@ -579,9 +492,9 @@ workflow.onComplete { // input: // set idPatient, idSample, idRun, file(inputFile1), file(inputFile2) from input_pair_reads_sentieon -// file(bwaIndex) from ch_bwa -// file(fasta) from ch_fasta -// file(fastaFai) from ch_fai +// file(bwaIndex) from bwa +// file(fasta) from fasta +// file(fastaFai) from fai // output: // set idPatient, idSample, idRun, file("${idSample}_${idRun}.bam") into bam_sentieon_mapped @@ -597,7 +510,7 @@ workflow.onComplete { // CN = params.sequencing_center ? "CN:${params.sequencing_center}\\t" : "" // readGroup = "@RG\\tID:${idRun}\\t${CN}PU:${idRun}\\tSM:${idSample}\\tLB:${idSample}\\tPL:illumina" // // adjust mismatch penalty for tumor samples -// status = statusMap[idPatient, idSample] +// status = status_map[idPatient, idSample] // extra = status == 1 ? "-B 3" : "" // """ // sentieon bwa mem -K 100000000 -R \"${readGroup}\" ${extra} -t ${task.cpus} -M ${fasta} \ @@ -703,8 +616,8 @@ workflow.onComplete { // // Creating a TSV file to restart from this step // tsv_bam_indexed.map { idPatient, idSample -> -// gender = genderMap[idPatient] -// status = statusMap[idPatient, idSample] +// gender = gender_map[idPatient] +// status = status_map[idPatient, idSample] // bam = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam" // bai = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam.bai" // "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n" @@ -714,8 +627,8 @@ workflow.onComplete { // tsv_bam_indexed_sample // .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { idPatient, idSample -> -// status = statusMap[idPatient, idSample] -// gender = genderMap[idPatient] +// status = status_map[idPatient, idSample] +// gender = gender_map[idPatient] // bam = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam" // bai = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam.bai" // ["mapped_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] @@ -757,7 +670,7 @@ workflow.onComplete { // --ASSUME_SORT_ORDER coordinate \ // --CREATE_INDEX true \ // --OUTPUT ${idSample}.md.bam - + // mv ${idSample}.md.bai ${idSample}.md.bam.bai // """ // else @@ -777,8 +690,8 @@ workflow.onComplete { // // Creating a TSV file to restart from this step // tsv_bam_duplicates_marked.map { idPatient, idSample -> -// gender = genderMap[idPatient] -// status = statusMap[idPatient, idSample] +// gender = gender_map[idPatient] +// status = status_map[idPatient, idSample] // bam = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam" // bai = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam.bai" // "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n" @@ -788,8 +701,8 @@ workflow.onComplete { // tsv_bam_duplicates_marked_sample // .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { idPatient, idSample -> -// status = statusMap[idPatient, idSample] -// gender = genderMap[idPatient] +// status = status_map[idPatient, idSample] +// gender = gender_map[idPatient] // bam = 
"${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam" // bai = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam.bai" // ["duplicates_marked_no_table_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] @@ -797,7 +710,7 @@ workflow.onComplete { // if ('markduplicates' in skipQC) duplicates_marked_report.close() -// if (step == 'preparerecalibration') bam_duplicates_marked = inputSample +// if (step == 'preparerecalibration') bam_duplicates_marked = input_sample // bam_duplicates_marked = bam_duplicates_marked.dump(tag:'MD BAM') // duplicates_marked_report = duplicates_marked_report.dump(tag:'MD Report') @@ -828,8 +741,8 @@ workflow.onComplete { // input: // set idPatient, idSample, file(bam), file(bai) from bam_sentieon_mapped_merged_indexed -// file(fasta) from ch_fasta -// file(fastaFai) from ch_fai +// file(fasta) from fasta +// file(fastaFai) from fai // output: // set idPatient, idSample, file("${idSample}.deduped.bam"), file("${idSample}.deduped.bam.bai") into bam_sentieon_dedup @@ -873,13 +786,13 @@ workflow.onComplete { // input: // set idPatient, idSample, file(bam), file(bai), file(intervalBed) from bamBaseRecalibrator -// file(dbsnp) from ch_dbsnp -// file(dbsnpIndex) from ch_dbsnp_tbi -// file(fasta) from ch_fasta -// file(dict) from ch_dict -// file(fastaFai) from ch_fai -// file(knownIndels) from ch_known_indels -// file(knownIndelsIndex) from ch_known_indels_tbi +// file(dbsnp) from dbsnp +// file(dbsnpIndex) from dbsnp_tbi +// file(fasta) from fasta +// file(dict) from dict +// file(fastaFai) from fai +// file(knownIndels) from known_indels +// file(knownIndelsIndex) from known_indels_tbi // output: // set idPatient, idSample, file("${prefix}${idSample}.recal.table") into tableGatherBQSRReports @@ -959,8 +872,8 @@ workflow.onComplete { // // Create TSV files to restart from this step // if (params.skip_markduplicates) { // recalTableTSV.map { idPatient, idSample -> -// status = statusMap[idPatient, idSample] -// gender = genderMap[idPatient] +// status = status_map[idPatient, idSample] +// gender = gender_map[idPatient] // bam = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam" // bai = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam.bai" // recalTable = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.recal.table" @@ -972,8 +885,8 @@ workflow.onComplete { // recalTableSampleTSV // .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV/") { // idPatient, idSample -> -// status = statusMap[idPatient, idSample] -// gender = genderMap[idPatient] +// status = status_map[idPatient, idSample] +// gender = gender_map[idPatient] // bam = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam" // bai = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam.bai" // recalTable = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.recal.table" @@ -981,8 +894,8 @@ workflow.onComplete { // } // } else { // recalTableTSV.map { idPatient, idSample -> -// status = statusMap[idPatient, idSample] -// gender = genderMap[idPatient] +// status = status_map[idPatient, idSample] +// gender = gender_map[idPatient] // bam = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam" // bai = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam.bai" // recalTable = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.recal.table" @@ -995,8 +908,8 @@ 
workflow.onComplete { // recalTableSampleTSV // .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV/") { // idPatient, idSample -> -// status = statusMap[idPatient, idSample] -// gender = genderMap[idPatient] +// status = status_map[idPatient, idSample] +// gender = gender_map[idPatient] // bam = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam" // bai = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam.bai" // recalTable = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.recal.table" @@ -1006,7 +919,7 @@ workflow.onComplete { // bamApplyBQSR = bamMDToJoin.join(recalTable, by:[0,1]) -// if (step == 'recalibrate') bamApplyBQSR = inputSample +// if (step == 'recalibrate') bamApplyBQSR = input_sample // bamApplyBQSR = bamApplyBQSR.dump(tag:'BAM + BAI + RECAL TABLE') @@ -1024,9 +937,9 @@ workflow.onComplete { // input: // set idPatient, idSample, file(bam), file(bai), file(recalibrationReport), file(intervalBed) from bamApplyBQSR -// file(dict) from ch_dict -// file(fasta) from ch_fasta -// file(fastaFai) from ch_fai +// file(dict) from dict +// file(fasta) from fasta +// file(fastaFai) from fai // output: // set idPatient, idSample, file("${prefix}${idSample}.recal.bam") into bam_recalibrated_to_merge @@ -1066,13 +979,13 @@ workflow.onComplete { // input: // set idPatient, idSample, file(bam), file(bai) from bam_sentieon_dedup -// file(dbsnp) from ch_dbsnp -// file(dbsnpIndex) from ch_dbsnp_tbi -// file(fasta) from ch_fasta -// file(dict) from ch_dict -// file(fastaFai) from ch_fai -// file(knownIndels) from ch_known_indels -// file(knownIndelsIndex) from ch_known_indels_tbi +// file(dbsnp) from dbsnp +// file(dbsnpIndex) from dbsnp_tbi +// file(fasta) from fasta +// file(dict) from dict +// file(fastaFai) from fai +// file(knownIndels) from known_indels +// file(knownIndelsIndex) from known_indels_tbi // output: // set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bam.bai") into bam_sentieon_recal @@ -1116,8 +1029,8 @@ workflow.onComplete { // // Creating a TSV file to restart from this step // tsv_sentieon_deduped.map { idPatient, idSample -> -// gender = genderMap[idPatient] -// status = statusMap[idPatient, idSample] +// gender = gender_map[idPatient] +// status = status_map[idPatient, idSample] // bam = "${params.outdir}/Preprocessing/${idSample}/DedupedSentieon/${idSample}.deduped.bam" // bai = "${params.outdir}/Preprocessing/${idSample}/DedupedSentieon/${idSample}.deduped.bam.bai" // table = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.table" @@ -1128,8 +1041,8 @@ workflow.onComplete { // tsv_sentieon_deduped_sample // .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { idPatient, idSample -> -// status = statusMap[idPatient, idSample] -// gender = genderMap[idPatient] +// status = status_map[idPatient, idSample] +// gender = gender_map[idPatient] // bam = "${params.outdir}/Preprocessing/${idSample}/DedupedSentieon/${idSample}.deduped.bam" // bai = "${params.outdir}/Preprocessing/${idSample}/DedupedSentieon/${idSample}.deduped.bam.bai" // table = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.table" @@ -1138,8 +1051,8 @@ workflow.onComplete { // // Creating a TSV file to restart from this step // tsv_sentieon_recal.map { idPatient, idSample -> -// gender = genderMap[idPatient] -// status = statusMap[idPatient, idSample] +// gender = gender_map[idPatient] +// status = status_map[idPatient, 
idSample] // bam = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam" // bai = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam.bai" // "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n" @@ -1149,8 +1062,8 @@ workflow.onComplete { // tsv_sentieon_recal_sample // .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { idPatient, idSample -> -// status = statusMap[idPatient, idSample] -// gender = genderMap[idPatient] +// status = status_map[idPatient, idSample] +// gender = gender_map[idPatient] // bam = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam" // bai = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam.bai" // ["sentieon_recalibrated_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] @@ -1216,8 +1129,8 @@ workflow.onComplete { // // Creating a TSV file to restart from this step // tsv_bam_recalibrated.map { idPatient, idSample -> -// gender = genderMap[idPatient] -// status = statusMap[idPatient, idSample] +// gender = gender_map[idPatient] +// status = status_map[idPatient, idSample] // bam = "${params.outdir}/Preprocessing/${idSample}/Recalibrated/${idSample}.recal.bam" // bai = "${params.outdir}/Preprocessing/${idSample}/Recalibrated/${idSample}.recal.bam.bai" // "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n" @@ -1228,8 +1141,8 @@ workflow.onComplete { // tsv_bam_recalibrated_sample // .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { // idPatient, idSample -> -// status = statusMap[idPatient, idSample] -// gender = genderMap[idPatient] +// status = status_map[idPatient, idSample] +// gender = gender_map[idPatient] // bam = "${params.outdir}/Preprocessing/${idSample}/Recalibrated/${idSample}.recal.bam" // bai = "${params.outdir}/Preprocessing/${idSample}/Recalibrated/${idSample}.recal.bam.bai" // ["recalibrated_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] @@ -1310,8 +1223,8 @@ workflow.onComplete { // // When no knownIndels for mapping, Channel bam_recalibrated is bam_duplicates_marked // if (!params.known_indels && step == 'mapping') bam_recalibrated = bam_duplicates_marked -// // When starting with variant calling, Channel bam_recalibrated is inputSample -// if (step == 'variantcalling') bam_recalibrated = inputSample +// // When starting with variant calling, Channel bam_recalibrated is input_sample +// if (step == 'variantcalling') bam_recalibrated = input_sample // bam_recalibrated = bam_recalibrated.dump(tag:'BAM for Variant Calling') @@ -1340,11 +1253,11 @@ workflow.onComplete { // input: // set idPatient, idSample, file(bam), file(bai), file(intervalBed) from bamHaplotypeCaller -// file(dbsnp) from ch_dbsnp -// file(dbsnpIndex) from ch_dbsnp_tbi -// file(dict) from ch_dict -// file(fasta) from ch_fasta -// file(fastaFai) from ch_fai +// file(dbsnp) from dbsnp +// file(dbsnpIndex) from dbsnp_tbi +// file(dict) from dict +// file(fasta) from fasta +// file(fastaFai) from fai // output: // set val("HaplotypeCallerGVCF"), idPatient, idSample, file("${intervalBed.baseName}_${idSample}.g.vcf") into gvcfHaplotypeCaller @@ -1379,11 +1292,11 @@ workflow.onComplete { // input: // set idPatient, idSample, file(intervalBed), file(gvcf) from gvcfGenotypeGVCFs -// file(dbsnp) from ch_dbsnp -// file(dbsnpIndex) from ch_dbsnp_tbi -// file(dict) from ch_dict -// file(fasta) from ch_fasta -// file(fastaFai) from ch_fai +// file(dbsnp) 
from dbsnp +// file(dbsnpIndex) from dbsnp_tbi +// file(dict) from dict +// file(fasta) from fasta +// file(fastaFai) from fai // output: // set val("HaplotypeCaller"), idPatient, idSample, file("${intervalBed.baseName}_${idSample}.vcf") into vcfGenotypeGVCFs @@ -1422,10 +1335,10 @@ workflow.onComplete { // input: // set idPatient, idSample, file(bam), file(bai), file(recal) from bam_sentieon_DNAseq -// file(dbsnp) from ch_dbsnp -// file(dbsnpIndex) from ch_dbsnp_tbi -// file(fasta) from ch_fasta -// file(fastaFai) from ch_fai +// file(dbsnp) from dbsnp +// file(dbsnpIndex) from dbsnp_tbi +// file(fasta) from fasta +// file(fastaFai) from fai // output: // set val("SentieonDNAseq"), idPatient, idSample, file("DNAseq_${idSample}.vcf") into vcf_sentieon_DNAseq @@ -1458,10 +1371,10 @@ workflow.onComplete { // input: // set idPatient, idSample, file(bam), file(bai), file(recal) from bam_sentieon_DNAscope -// file(dbsnp) from ch_dbsnp -// file(dbsnpIndex) from ch_dbsnp_tbi -// file(fasta) from ch_fasta -// file(fastaFai) from ch_fai +// file(dbsnp) from dbsnp +// file(dbsnpIndex) from dbsnp_tbi +// file(fasta) from fasta +// file(fastaFai) from fai // output: // set val("SentieonDNAscope"), idPatient, idSample, file("DNAscope_${idSample}.vcf") into vcf_sentieon_DNAscope @@ -1515,8 +1428,8 @@ workflow.onComplete { // input: // set idPatient, idSample, file(bam), file(bai) from bamStrelkaSingle -// file(fasta) from ch_fasta -// file(fastaFai) from ch_fai +// file(fasta) from fasta +// file(fastaFai) from fai // file(targetBED) from ch_target_bed // output: @@ -1562,8 +1475,8 @@ workflow.onComplete { // input: // set idPatient, idSample, file(bam), file(bai) from bamMantaSingle -// file(fasta) from ch_fasta -// file(fastaFai) from ch_fai +// file(fasta) from fasta +// file(fastaFai) from fai // file(targetBED) from ch_target_bed // output: @@ -1574,13 +1487,13 @@ workflow.onComplete { // script: // beforeScript = params.target_bed ? "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" // options = params.target_bed ? "--exome --callRegions call_targets.bed.gz" : "" -// status = statusMap[idPatient, idSample] -// inputbam = status == 0 ? "--bam" : "--tumorBam" +// status = status_map[idPatient, idSample] +// input_bam = status == 0 ? "--bam" : "--tumorBam" // vcftype = status == 0 ? "diploid" : "tumor" // """ // ${beforeScript} // configManta.py \ -// ${inputbam} ${bam} \ +// ${input_bam} ${bam} \ // --reference ${fasta} \ // ${options} \ // --runDir Manta @@ -1617,8 +1530,8 @@ workflow.onComplete { // input: // set idPatient, idSample, file(bam), file(bai) from bamTIDDIT -// file(fasta) from ch_fasta -// file(fastaFai) from ch_fai +// file(fasta) from fasta +// file(fastaFai) from fai // output: // set val("TIDDIT"), idPatient, idSample, file("*.vcf.gz"), file("*.tbi") into vcfTIDDIT @@ -1648,15 +1561,15 @@ workflow.onComplete { // tag "${idSample}-${intervalBed.baseName}" // label 'cpus_1' - + // input: // set idPatient, idSample, file(bam), file(bai), file(intervalBed) from bamFreebayesSingle -// file(fasta) from ch_fasta +// file(fasta) from fasta // file(fastaFai) from ch_software_versions_yaml - + // output: // set val("FreeBayes"), idPatient, idSample, file("${intervalBed.baseName}_${idSample}.vcf") into vcfFreebayesSingle - + // when: 'freebayes' in tools // script: @@ -1686,7 +1599,7 @@ workflow.onComplete { // bamTumor = Channel.create() // bamRecalAll -// .choice(bamTumor, bamNormal) {statusMap[it[0], it[1]] == 0 ? 
1 : 0} +// .choice(bamTumor, bamNormal) {status_map[it[0], it[1]] == 0 ? 1 : 0} // // Crossing Normal and Tumor to get a T/N pair for Somatic Variant Calling // // Remapping channel to remove common key idPatient @@ -1707,7 +1620,7 @@ workflow.onComplete { // bam_sentieon_tumor = Channel.create() // bam_sentieon_all -// .choice(bam_sentieon_tumor, bam_sention_normal) {statusMap[it[0], it[1]] == 0 ? 1 : 0} +// .choice(bam_sentieon_tumor, bam_sention_normal) {status_map[it[0], it[1]] == 0 ? 1 : 0} // // Crossing Normal and Tumor to get a T/N pair for Somatic Variant Calling // // Remapping channel to remove common key idPatient @@ -1733,8 +1646,8 @@ workflow.onComplete { // input: // set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(intervalBed) from pairBamFreeBayes -// file(fasta) from ch_fasta -// file(fastaFai) from ch_fai +// file(fasta) from fasta +// file(fastaFai) from fai // output: // set val("FreeBayes"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf") into vcfFreeBayes @@ -1771,14 +1684,14 @@ workflow.onComplete { // input: // set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(intervalBed) from pairBamMutect2 -// file(dict) from ch_dict -// file(fasta) from ch_fasta -// file(fastaFai) from ch_fai -// file(germlineResource) from ch_germline_resource -// file(germlineResourceIndex) from ch_germline_resource_tbi -// file(intervals) from ch_intervals -// file(pon) from ch_pon -// file(ponIndex) from ch_pon_tbi +// file(dict) from dict +// file(fasta) from fasta +// file(fastaFai) from fai +// file(germlineResource) from germline_resource +// file(germlineResourceIndex) from germline_resource_tbi +// file(intervals) from intervals +// file(pon) from pon +// file(ponIndex) from pon_tbi // output: // set val("Mutect2"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf") into mutect2Output @@ -1820,20 +1733,20 @@ workflow.onComplete { // input: // set idPatient, idSamplePair, file(statsFiles), file(vcf) from mutect2Stats // Actual stats files and corresponding VCF chunks -// file(dict) from ch_dict -// file(fasta) from ch_fasta -// file(fastaFai) from ch_fai -// file(germlineResource) from ch_germline_resource -// file(germlineResourceIndex) from ch_germline_resource_tbi -// file(intervals) from ch_intervals +// file(dict) from dict +// file(fasta) from fasta +// file(fastaFai) from fai +// file(germlineResource) from germline_resource +// file(germlineResourceIndex) from germline_resource_tbi +// file(intervals) from intervals // output: // set idPatient, idSamplePair, file("${idSamplePair}.vcf.gz.stats") into mergedStatsFile // when: 'mutect2' in tools -// script: -// stats = statsFiles.collect{ "-stats ${it} " }.join(' ') +// script: +// stats = statsFiles.collect{ "-stats ${it} " }.join(' ') // """ // gatk --java-options "-Xmx${task.memory.toGiga()}g" \ // MergeMutectStats \ @@ -1859,7 +1772,7 @@ workflow.onComplete { // input: // set variantCaller, idPatient, idSample, file(vcf) from vcfConcatenateVCFs -// file(fastaFai) from ch_fai +// file(fastaFai) from fai // file(targetBED) from ch_target_bed // output: @@ -1895,7 +1808,7 @@ workflow.onComplete { // input: // set variantCaller, idPatient, idSample, file(vcf) from mutect2Output -// file(fastaFai) from ch_fai +// file(fastaFai) from fai // 
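// NB: the tumor/normal split above uses the (DSL1-era) choice operator, which routes
// each item to the output channel at the index returned by the closure, so the
// argument order matters:
// .choice(bamTumor, bamNormal) { status_map[it[0], it[1]] == 0 ? 1 : 0 }
//   status == 0 (normal) -> closure returns 1 -> second argument, bamNormal
//   status == 1 (tumor)  -> closure returns 0 -> first argument,  bamTumor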
file(targetBED) from ch_target_bed // output: @@ -1929,8 +1842,8 @@ workflow.onComplete { // input: // set idPatient, idSampleNormal, idSampleTumor, file(bamNormal), file(baiNormal), file(bamTumor), file(baiTumor), file(intervalBed), file(statsFile) from pairBamPileupSummaries -// file(germlineResource) from ch_germline_resource -// file(germlineResourceIndex) from ch_germline_resource_tbi +// file(germlineResource) from germline_resource +// file(germlineResourceIndex) from germline_resource_tbi // output: // set idPatient, idSampleNormal, idSampleTumor, file("${intervalBed.baseName}_${idSampleTumor}_pileupsummaries.table") into pileupSummaries @@ -1962,7 +1875,7 @@ workflow.onComplete { // input: // set idPatient, idSampleNormal, idSampleTumor, file(pileupSums) from pileupSummaries -// file(dict) from ch_dict +// file(dict) from dict // output: // set idPatient, idSampleNormal, idSampleTumor, file("${idSampleTumor}_pileupsummaries.table") into mergedPileupFile @@ -1996,13 +1909,13 @@ workflow.onComplete { // input: // set idPatient, idSampleNormal, idSampleTumor, file(bamNormal), file(baiNormal), file(bamTumor), file(baiTumor), file(mergedPileup) from pairBamCalculateContamination - + // output: // set idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("${idSampleTumor}_contamination.table") into contaminationTable // when: 'mutect2' in tools -// script: +// script: // """ // # calculate contamination // gatk --java-options "-Xmx${task.memory.toGiga()}g" \ @@ -2028,12 +1941,12 @@ workflow.onComplete { // input: // set idPatient, idSamplePair, file(unfiltered), file(unfilteredIndex), file(stats), file(contaminationTable) from mutect2CallsToFilter -// file(dict) from ch_dict -// file(fasta) from ch_fasta -// file(fastaFai) from ch_fai -// file(germlineResource) from ch_germline_resource -// file(germlineResourceIndex) from ch_germline_resource_tbi -// file(intervals) from ch_intervals +// file(dict) from dict +// file(fasta) from fasta +// file(fastaFai) from fai +// file(germlineResource) from germline_resource +// file(germlineResourceIndex) from germline_resource_tbi +// file(intervals) from intervals // output: // set val("Mutect2"), idPatient, idSamplePair, file("Mutect2_filtered_${idSamplePair}.vcf.gz"), file("Mutect2_filtered_${idSamplePair}.vcf.gz.tbi"), file("Mutect2_filtered_${idSamplePair}.vcf.gz.filteringStats.tsv") into filteredMutect2Output @@ -2064,13 +1977,13 @@ workflow.onComplete { // input: // set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), file(recalNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(recalTumor) from bam_pair_sentieon_TNscope -// file(dict) from ch_dict -// file(fasta) from ch_fasta -// file(fastaFai) from ch_fai -// file(dbsnp) from ch_dbsnp -// file(dbsnpIndex) from ch_dbsnp_tbi -// file(pon) from ch_pon -// file(ponIndex) from ch_pon_tbi +// file(dict) from dict +// file(fasta) from fasta +// file(fastaFai) from fai +// file(dbsnp) from dbsnp +// file(dbsnpIndex) from dbsnp_tbi +// file(pon) from pon +// file(ponIndex) from pon_tbi // output: // set val("SentieonTNscope"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("*.vcf") into vcf_sentieon_TNscope @@ -2132,9 +2045,9 @@ workflow.onComplete { // input: // set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from pairBamStrelka -// file(dict) from ch_dict -// file(fasta) from ch_fasta -// file(fastaFai) from ch_fai +// file(dict) from dict +// file(fasta) from fasta +// file(fastaFai) from 
fai // file(targetBED) from ch_target_bed // output: @@ -2181,8 +2094,8 @@ workflow.onComplete { // input: // set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from pairBamManta -// file(fasta) from ch_fasta -// file(fastaFai) from ch_fai +// file(fasta) from fasta +// file(fastaFai) from fai // file(targetBED) from ch_target_bed // output: @@ -2247,9 +2160,9 @@ workflow.onComplete { // input: // set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(mantaCSI), file(mantaCSIi) from pairBamStrelkaBP -// file(dict) from ch_dict -// file(fasta) from ch_fasta -// file(fastaFai) from ch_fai +// file(dict) from dict +// file(fasta) from fasta +// file(fastaFai) from fai // file(targetBED) from ch_target_bed // output: @@ -2295,7 +2208,7 @@ workflow.onComplete { // input: // set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from pairBamCNVkit // file(targetBED) from ch_target_bed -// file(fasta) from ch_fasta +// file(fasta) from fasta // output: // set idPatient, idSampleNormal, idSampleTumor, file("${idSampleTumor}*"), file("${idSampleNormal}*") into cnvkitOut @@ -2313,7 +2226,7 @@ workflow.onComplete { // --output-reference output_reference.cnn \ // --output-dir ./ \ // --diagram \ -// --scatter +// --scatter // """ // } @@ -2327,8 +2240,8 @@ workflow.onComplete { // tag "${fasta}" // input: -// file(fasta) from ch_fasta -// file(fastaFai) from ch_fai +// file(fasta) from fasta +// file(fastaFai) from fai // output: // file "microsatellites.list" into msi_scan_ch @@ -2384,9 +2297,9 @@ workflow.onComplete { // input: // set idPatient, idSample, file(bam), file(bai) from bamAscat // file(acLoci) from ch_ac_loci -// file(dict) from ch_dict -// file(fasta) from ch_fasta -// file(fastaFai) from ch_fai +// file(dict) from dict +// file(fasta) from fasta +// file(fastaFai) from fai // output: // set idPatient, idSample, file("${idSample}.alleleCount") into alleleCounterOut @@ -2407,7 +2320,7 @@ workflow.onComplete { // alleleCountOutTumor = Channel.create() // alleleCounterOut -// .choice(alleleCountOutTumor, alleleCountOutNormal) {statusMap[it[0], it[1]] == 0 ? 1 : 0} +// .choice(alleleCountOutTumor, alleleCountOutNormal) {status_map[it[0], it[1]] == 0 ? 1 : 0} // alleleCounterOut = alleleCountOutNormal.combine(alleleCountOutTumor, by:0) @@ -2437,7 +2350,7 @@ workflow.onComplete { // when: 'ascat' in tools // script: -// gender = genderMap[idPatient] +// gender = gender_map[idPatient] // """ // convertAlleleCounts.r ${idSampleTumor} ${alleleCountTumor} ${idSampleNormal} ${alleleCountNormal} ${gender} // """ @@ -2464,7 +2377,7 @@ workflow.onComplete { // when: 'ascat' in tools // script: -// gender = genderMap[idPatient] +// gender = gender_map[idPatient] // purity_ploidy = (params.ascat_purity && params.ascat_ploidy) ? 
"--purity ${params.ascat_purity} --ploidy ${params.ascat_ploidy}" : "" // """ // for f in *BAF *LogR; do sed 's/chr//g' \$f > tmpFile; mv tmpFile \$f;done @@ -2495,8 +2408,8 @@ workflow.onComplete { // input: // set idPatient, idSample, file(bam), file(bai), file(intervalBed) from bamMpileup -// file(fasta) from ch_fasta -// file(fastaFai) from ch_fai +// file(fasta) from fasta +// file(fastaFai) from fai // output: // set idPatient, idSample, file("${prefix}${idSample}.pileup") into mpileupMerge @@ -2521,8 +2434,8 @@ workflow.onComplete { // // Creating a TSV file to restart from this step // tsv_mpileup.map { idPatient, idSample -> -// gender = genderMap[idPatient] -// status = statusMap[idPatient, idSample] +// gender = gender_map[idPatient] +// status = status_map[idPatient, idSample] // mpileup = "${params.outdir}/VariantCalling/${idSample}/Control-FREEC/${idSample}.pileup" // "${idPatient}\t${gender}\t${status}\t${idSample}\t${mpileup}\n" // }.collectFile( @@ -2532,8 +2445,8 @@ workflow.onComplete { // tsv_mpileup_sample // .collectFile(storeDir: "${params.outdir}/VariantCalling/TSV") { // idPatient, idSample -> -// status = statusMap[idPatient, idSample] -// gender = genderMap[idPatient] +// status = status_map[idPatient, idSample] +// gender = gender_map[idPatient] // mpileup = "${params.outdir}/VariantCalling/${idSample}/Control-FREEC/${idSample}.pileup" // ["control-freec_mpileup_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${mpileup}\n"] // } @@ -2576,10 +2489,10 @@ workflow.onComplete { // mpileupOutNormal = Channel.create() // mpileupOutTumor = Channel.create() -// if (step == 'controlfreec') mpileupOut = inputSample +// if (step == 'controlfreec') mpileupOut = input_sample // mpileupOut -// .choice(mpileupOutTumor, mpileupOutNormal) {statusMap[it[0], it[1]] == 0 ? 1 : 0} +// .choice(mpileupOutTumor, mpileupOutNormal) {status_map[it[0], it[1]] == 0 ? 1 : 0} // mpileupOut = mpileupOutNormal.combine(mpileupOutTumor, by:0) @@ -2604,10 +2517,10 @@ workflow.onComplete { // file(chrDir) from ch_chr_dir // file(mappability) from ch_mappability // file(chrLength) from ch_chr_length -// file(dbsnp) from ch_dbsnp -// file(dbsnpIndex) from ch_dbsnp_tbi -// file(fasta) from ch_fasta -// file(fastaFai) from ch_fai +// file(dbsnp) from dbsnp +// file(dbsnpIndex) from dbsnp_tbi +// file(fasta) from fasta +// file(fastaFai) from fai // output: // set idPatient, idSampleNormal, idSampleTumor, file("${idSampleTumor}.pileup_CNVs"), file("${idSampleTumor}.pileup_ratio.txt"), file("${idSampleTumor}.pileup_normal_CNVs"), file("${idSampleTumor}.pileup_normal_ratio.txt"), file("${idSampleTumor}.pileup_BAF.txt"), file("${idSampleNormal}.pileup_BAF.txt") into controlFreecViz @@ -2617,9 +2530,9 @@ workflow.onComplete { // script: // config = "${idSampleTumor}_vs_${idSampleNormal}.config.txt" -// gender = genderMap[idPatient] -// // if we are using coefficientOfVariation, we must delete the window parameter -// // it is "window = 20000" in the default settings, without coefficientOfVariation set, +// gender = gender_map[idPatient] +// // if we are using coefficientOfVariation, we must delete the window parameter +// // it is "window = 20000" in the default settings, without coefficientOfVariation set, // // but we do not like it. Note, it is not written in stone // coeff_or_window = params.cf_window ? 
"window = ${params.cf_window}" : "coefficientOfVariation = ${params.cf_coeff}" @@ -2833,7 +2746,7 @@ workflow.onComplete { // vcfToAnnotate = Channel.create() // vcfNoAnnotate = Channel.create() -// if (tsvPath == []) { +// if (tsv_path == []) { // // Sarek, by default, annotates all available vcfs that it can find in the VariantCalling directory // // Excluding vcfs from FreeBayes, and g.vcf from HaplotypeCaller // // Basically it's: results/VariantCalling/*/{HaplotypeCaller,Manta,Mutect2,SentieonDNAseq,SentieonDNAscope,SentieonTNscope,Strelka,TIDDIT}/*.vcf.gz @@ -2858,12 +2771,12 @@ workflow.onComplete { // Channel.fromPath("${params.outdir}/VariantCalling/*/TIDDIT/*.vcf.gz") // .flatten().map{vcf -> ['TIDDIT', vcf.minus(vcf.fileName)[-2].toString(), vcf]} // ).choice(vcfToAnnotate, vcfNoAnnotate) { -// annotateTools == [] || (annotateTools != [] && it[0] in annotateTools) ? 0 : 1 +// annotate_tools == [] || (annotate_tools != [] && it[0] in annotate_tools) ? 0 : 1 // } -// } else if (annotateTools == []) { +// } else if (annotate_tools == []) { // // Annotate user-submitted VCFs // // If user-submitted, Sarek assume that the idSample should be assumed automatically -// vcfToAnnotate = Channel.fromPath(tsvPath) +// vcfToAnnotate = Channel.fromPath(tsv_path) // .map{vcf -> ['userspecified', vcf.minus(vcf.fileName)[-2].toString(), vcf]} // } else exit 1, "specify only tools or files to annotate, not both" @@ -3104,10 +3017,10 @@ workflow.onComplete { // publishDir "${params.outdir}/Reports/MultiQC", mode: params.publish_dir_mode // input: -// file (multiqcConfig) from ch_multiqc_config -// file (mqc_custom_config) from ch_multiqc_custom_config.collect().ifEmpty([]) +// file (multiqcConfig) from multiqc_config +// file (mqc_custom_config) from multiqc_custom_config.collect().ifEmpty([]) // file (versions) from ch_software_versions_yaml.collect() -// file workflow_summary from ch_workflow_summary.collectFile(name: "workflow_summary_mqc.yaml") +// file workflow_summary from workflow_summary.collectFile(name: "workflow_summary_mqc.yaml") // file ('bamQC/*') from bamQCReport.collect().ifEmpty([]) // file ('BCFToolsStats/*') from bcftoolsReport.collect().ifEmpty([]) // file ('FastQC/*') from fastQCReport.collect().ifEmpty([]) @@ -3135,23 +3048,3 @@ workflow.onComplete { // } // ch_multiqc_report.dump(tag:'MultiQC') - -// // Output Description HTML -// process Output_documentation { -// publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode - -// input: -// file output_docs from ch_output_docs - -// output: -// file "results_description.html" - -// when: !('documentation' in skipQC) - -// script: -// """ -// markdown_to_html.py $output_docs -o results_description.html -// """ -// } - - From 6059e722c4a427e256b8a8f84e55f161a30c7b85 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 16 Jul 2020 09:25:08 +0200 Subject: [PATCH 028/200] more complete headers --- lib/Schema.groovy | 121 +++++++++++++++++++++++++++++++++++++++++----- main.nf | 31 ++++++------ 2 files changed, 124 insertions(+), 28 deletions(-) diff --git a/lib/Schema.groovy b/lib/Schema.groovy index 431eeed6d0..9699004195 100644 --- a/lib/Schema.groovy +++ b/lib/Schema.groovy @@ -82,31 +82,130 @@ class JSON { def Map summary = [:] if (workflow.revision) summary['Pipeline Release'] = workflow.revision summary['Run Name'] = run_name ?: workflow.runName - // TODO nf-core: Report custom parameters here - summary['Input'] = params.input - summary['Fasta File'] = params.fasta summary['Max Resources'] = "$params.max_memory 
memory, $params.max_cpus cpus, $params.max_time time per job" if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container" + + summary['Input'] = params.input + // summary['Step'] = step + summary['Genome'] = params.genome + // if (params.no_intervals && step != 'annotate') summary['Intervals'] = 'Do not use' + summary['Nucleotides/s'] = params.nucleotides_per_second + if (params.sentieon) summary['Sentieon'] = "Using Sentieon for Preprocessing and/or Variant Calling" + // if (params.skip_qc) summary['QC tools skipped'] = skip_qc.join(', ') + if (params.target_bed) summary['Target BED'] = params.target_bed + // if (params.tools) summary['Tools'] = tools.join(', ') + if (params.trim_fastq || params.split_fastq) summary['Modify fastqs'] = "trim and/or split" + + if (params.trim_fastq) { + summary['Fastq trim'] = "Fastq trim selected" + summary['Trim R1'] = "${params.clip_r1} bp" + summary['Trim R2'] = "${params.clip_r2} bp" + summary["Trim 3 R1"] = "${params.three_prime_clip_r1} bp" + summary["Trim 3 R2"] = "${params.three_prime_clip_r2} bp" + summary['NextSeq Trim'] = "${params.trim_nextseq} bp" + summary['Saved Trimmed Fastq'] = params.save_trimmed ? 'Yes' : 'No' + } + if (params.split_fastq) summary['Reads in fastq'] = params.split_fastq + + summary['MarkDuplicates'] = "Options" + summary['Java options'] = params.markdup_java_options + summary['GATK Spark'] = params.no_gatk_spark ? 'No' : 'Yes' + + summary['Save BAMs mapped'] = params.save_bam_mapped ? 'Yes' : 'No' + summary['Skip MarkDuplicates'] = params.skip_markduplicates ? 'Yes' : 'No' + + // if ('ascat' in tools) { + summary['ASCAT'] = "Options" + if (params.ascat_purity) summary['purity'] = params.ascat_purity + if (params.ascat_ploidy) summary['ploidy'] = params.ascat_ploidy + // } + + // if ('controlfreec' in tools) { + summary['Control-FREEC'] = "Options" + if (params.cf_window) summary['window'] = params.cf_window + if (params.cf_coeff) summary['coeff of variation'] = params.cf_coeff + if (params.cf_ploidy) summary['ploidy'] = params.cf_ploidy + // } + + // if ('haplotypecaller' in tools) summary['GVCF'] = params.no_gvcf ? 'No' : 'Yes' + // if ('strelka' in tools && 'manta' in tools) summary['Strelka BP'] = params.no_strelka_bp ? 'No' : 'Yes' + // if (params.pon && ('mutect2' in tools || (params.sentieon && 'tnscope' in tools))) summary['Panel of normals'] = params.pon + + // if (params.annotate_tools) summary['Tools to annotate'] = annotate_tools.join(', ') + + if (params.annotation_cache) { + summary['Annotation cache'] = "Enabled" + if (params.snpeff_cache) summary['snpEff cache'] = params.snpeff_cache + if (params.vep_cache) summary['VEP cache'] = params.vep_cache + } + + if (params.cadd_cache) { + summary['CADD cache'] = "Enabled" + if (params.cadd_indels) summary['CADD indels'] = params.cadd_indels + if (params.cadd_wg_snvs) summary['CADD wg snvs'] = params.cadd_wg_snvs + } + + if (params.genesplicer) summary['genesplicer'] = "Enabled" + + if (params.igenomes_base && !params.igenomes_ignore) summary['AWS iGenomes base'] = params.igenomes_base + if (params.igenomes_ignore) summary['AWS iGenomes'] = "Do not use" + if (params.genomes_base && !params.igenomes_ignore) summary['Genomes base'] = params.genomes_base + + summary['Save Reference'] = params.save_reference ? 
'Yes' : 'No' + + if (params.ac_loci) summary['Loci'] = params.ac_loci + if (params.ac_loci_gc) summary['Loci GC'] = params.ac_loci_gc + if (params.bwa) summary['BWA indexes'] = params.bwa + if (params.chr_dir) summary['Chromosomes'] = params.chr_dir + if (params.chr_length) summary['Chromosomes length'] = params.chr_length + if (params.dbsnp) summary['dbsnp'] = params.dbsnp + if (params.dbsnp_index) summary['dbsnpIndex'] = params.dbsnp_index + if (params.dict) summary['dict'] = params.dict + if (params.fasta) summary['fasta reference'] = params.fasta + if (params.fasta_fai) summary['fasta index'] = params.fasta_fai + if (params.germline_resource) summary['germline resource'] = params.germline_resource + if (params.germline_resource_index) summary['germline resource index'] = params.germline_resource_index + if (params.intervals) summary['intervals'] = params.intervals + if (params.known_indels) summary['known indels'] = params.known_indels + if (params.known_indels_index) summary['known indels index'] = params.known_indels_index + if (params.mappability) summary['Mappability'] = params.mappability + if (params.snpeff_cache) summary['snpEff cache'] = params.snpeff_cache + if (params.snpeff_db) summary['snpEff DB'] = params.snpeff_db + if (params.species) summary['species'] = params.species + if (params.vep_cache) summary['VEP cache'] = params.vep_cache + if (params.vep_cache_version) summary['VEP cache version'] = params.vep_cache_version + summary['Output dir'] = params.outdir + summary['Publish dir mode'] = params.publish_dir_mode + if (params.sequencing_center) summary['Sequenced by'] = params.sequencing_center + summary['Launch dir'] = workflow.launchDir summary['Working dir'] = workflow.workDir summary['Script dir'] = workflow.projectDir summary['User'] = workflow.userName - if (workflow.profile.contains('awsbatch')) { - summary['AWS Region'] = params.awsregion - summary['AWS Queue'] = params.awsqueue - summary['AWS CLI'] = params.awscli - } + + if (params.multiqc_config) summary['MultiQC config'] = params.multiqc_config + summary['Config Profile'] = workflow.profile - if (params.config_profile_description) summary['Config Profile Descr'] = params.config_profile_description - if (params.config_profile_contact) summary['Config Profile Contact'] = params.config_profile_contact - if (params.config_profile_url) summary['Config Profile URL'] = params.config_profile_url + + if (params.config_profile_description) summary['Description'] = params.config_profile_description + if (params.config_profile_contact) summary['Contact'] = params.config_profile_contact + if (params.config_profile_url) summary['URL'] = params.config_profile_url + summary['Config Files'] = workflow.configFiles.join(', ') + if (params.email || params.email_on_fail) { summary['E-mail Address'] = params.email summary['E-mail on failure'] = params.email_on_fail summary['MultiQC maxsize'] = params.max_multiqc_email_size } + + if (workflow.profile.contains('awsbatch')) { + summary['AWS Region'] = params.awsregion + summary['AWS Queue'] = params.awsqueue + summary['AWS CLI'] = params.awscli + } + return summary } diff --git a/main.nf b/main.nf index 3705e1f3a4..8d7ab3f974 100644 --- a/main.nf +++ b/main.nf @@ -21,21 +21,12 @@ nf-core/sarek: nextflow.preview.dsl = 2 -/* - * Print help message if required - */ -if (params.help) { - def command = "nextflow run nf-core/sarek --input sample.tsv -profile docker" - log.info Headers.nf_core(workflow, params.monochrome_logs) - log.info Schema.params_help("$baseDir/nextflow_schema.json", 
command) - exit 0 -} - /* ================================================================================ INCLUDE SAREK FUNCTIONS ================================================================================ */ + include { check_parameter_existence; check_parameter_list; @@ -54,17 +45,12 @@ include { ================================================================================ */ -/* - * Check parameters - */ +// Check parameters Checks.aws_batch(workflow, params) // Check AWS batch settings Checks.hostname(workflow, params, log) // Check the hostnames against configured profiles -/* - * MultiQC - * Stage config files - */ +// MultiQC - Stage config files multiqc_config = file("$baseDir/assets/multiqc_config.yaml", checkIfExists: true) multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty() @@ -258,6 +244,16 @@ workflow_summary = Channel.value(workflow_summary) if ('mutect2' in tools && !(params.pon)) log.warn "[nf-core/sarek] Mutect2 was requested, but as no panel of normals were given, results will not be optimal" if (params.sentieon) log.warn "[nf-core/sarek] Sentieon will be used, only works if Sentieon is available where nf-core/sarek is run" +// Print help message if required + +if (params.help) { + def command = "nextflow run nf-core/sarek --input sample.tsv -profile docker" + log.info Headers.nf_core(workflow, params.monochrome_logs) + log.info Schema.params_help("$baseDir/nextflow_schema.json", command) + exit 0 +} + + /* ================================================================================ INCLUDE LOCAL PIPELINE MODULES @@ -410,6 +406,7 @@ workflow { SEND COMPLETION EMAIL ================================================================================ */ + workflow.onComplete { def multiqc_report = [] Completion.email(workflow, params, summary, run_name, baseDir, multiqc_report, log) From 131cd7416f0de3af1631ca1bb250a3340ff811d2 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 16 Jul 2020 09:27:59 +0200 Subject: [PATCH 029/200] code polish --- lib/Schema.groovy | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Schema.groovy b/lib/Schema.groovy index 9699004195..20f49d03e0 100644 --- a/lib/Schema.groovy +++ b/lib/Schema.groovy @@ -159,7 +159,7 @@ class JSON { if (params.chr_dir) summary['Chromosomes'] = params.chr_dir if (params.chr_length) summary['Chromosomes length'] = params.chr_length if (params.dbsnp) summary['dbsnp'] = params.dbsnp - if (params.dbsnp_index) summary['dbsnpIndex'] = params.dbsnp_index + if (params.dbsnp_index) summary['dbsnp index'] = params.dbsnp_index if (params.dict) summary['dict'] = params.dict if (params.fasta) summary['fasta reference'] = params.fasta if (params.fasta_fai) summary['fasta index'] = params.fasta_fai @@ -171,7 +171,7 @@ class JSON { if (params.mappability) summary['Mappability'] = params.mappability if (params.snpeff_cache) summary['snpEff cache'] = params.snpeff_cache if (params.snpeff_db) summary['snpEff DB'] = params.snpeff_db - if (params.species) summary['species'] = params.species + if (params.species) summary['snpEff species'] = params.species if (params.vep_cache) summary['VEP cache'] = params.vep_cache if (params.vep_cache_version) summary['VEP cache version'] = params.vep_cache_version From 4bfc0f909bc3dcc914d5638dc784e9119cb638db Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 16 Jul 2020 09:51:34 +0200 Subject: [PATCH 030/200] remove headers from help message --- main.nf | 18 ++++++++---------- 1 file 
changed, 8 insertions(+), 10 deletions(-) diff --git a/main.nf b/main.nf index 8d7ab3f974..749de268ec 100644 --- a/main.nf +++ b/main.nf @@ -21,6 +21,14 @@ nf-core/sarek: nextflow.preview.dsl = 2 +// Print help message if required + +if (params.help) { + def command = "nextflow run nf-core/sarek -profile docker --input sample.tsv" + log.info Schema.params_help("$baseDir/nextflow_schema.json", command) + exit 0 +} + /* ================================================================================ INCLUDE SAREK FUNCTIONS @@ -244,16 +252,6 @@ workflow_summary = Channel.value(workflow_summary) if ('mutect2' in tools && !(params.pon)) log.warn "[nf-core/sarek] Mutect2 was requested, but as no panel of normals were given, results will not be optimal" if (params.sentieon) log.warn "[nf-core/sarek] Sentieon will be used, only works if Sentieon is available where nf-core/sarek is run" -// Print help message if required - -if (params.help) { - def command = "nextflow run nf-core/sarek --input sample.tsv -profile docker" - log.info Headers.nf_core(workflow, params.monochrome_logs) - log.info Schema.params_help("$baseDir/nextflow_schema.json", command) - exit 0 -} - - /* ================================================================================ INCLUDE LOCAL PIPELINE MODULES From b43b580161b792ba46bb9fe55ea4ff77d6f0e8fb Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 16 Jul 2020 10:12:15 +0200 Subject: [PATCH 031/200] fix .out issue with nextflow 20.06.0-edge --- modules/subworkflows/build_indices.nf | 49 +++++++++++++-------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/modules/subworkflows/build_indices.nf b/modules/subworkflows/build_indices.nf index ae3e4967d2..d5569e63e8 100644 --- a/modules/subworkflows/build_indices.nf +++ b/modules/subworkflows/build_indices.nf @@ -19,47 +19,46 @@ include { SAMTOOLS_FAIDX as SAMTOOLS_FAIDX } from '../nf-core/samtools_faidx.nf' workflow BUILD_INDICES{ take: - ch_dbsnp - ch_fasta - ch_germline_resource - ch_known_indels - ch_pon + dbsnp + fasta + germline_resource + known_indels + pon step main: - if (!(params.bwa) && params.fasta && 'mapping' in step) - BWAMEM2_INDEX(ch_fasta) + if (!(params.bwa) && params.fasta && 'mapping' in step) + BWAMEM2_INDEX(fasta) if (!(params.dict) && params.fasta && !('annotate' in step) && !('controlfreec' in step)) - GATK_CREATE_SEQUENCE_DICTIONARY(ch_fasta) + GATK_CREATE_SEQUENCE_DICTIONARY(fasta) if (!(params.fasta_fai) && params.fasta && !('annotate' in step)) - SAMTOOLS_FAIDX(ch_fasta) + SAMTOOLS_FAIDX(fasta) if (!(params.dbsnp_index) && params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || 'tnscope' in tools)) - HTSLIB_TABIX_DBSNP(ch_dbsnp) - + HTSLIB_TABIX_DBSNP(dbsnp) + if (!(params.germline_resource_index) && params.germline_resource && 'mutect2' in tools) - HTSLIB_TABIX_GERMLINE_RESOURCE(ch_germline_resource) - + HTSLIB_TABIX_GERMLINE_RESOURCE(germline_resource) + if (!(params.known_indels_index) && params.known_indels && ('mapping' in step || 'preparerecalibration' in step)) - HTSLIB_TABIX_KNOWN_INDELS(ch_known_indels) + HTSLIB_TABIX_KNOWN_INDELS(known_indels) if (!(params.pon_index) && params.pon && ('tnscope' in tools || 'mutect2' in tools)) - HTSLIB_TABIX_PON(ch_pon) + HTSLIB_TABIX_PON(pon) - if (!(params.intervals) && !('annotate' in step) && !('controlfreec' in step)) { + if (!(params.intervals) && !('annotate' in step) && !('controlfreec' in step)) BUILD_INTERVALS(SAMTOOLS_FAIDX.out) -
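The take/emit rework in this commit is easier to see in isolation. Below is a stripped-down skeleton of the shape the series converges on (named emits assigned from process outputs; the `.out` form is restored two commits later). The workflow and module names are placeholders, not the actual sarek files:

    include { SAMTOOLS_FAIDX } from '../nf-core/samtools_faidx.nf'

    workflow BUILD_X {
        take:
        fasta                       // reference FASTA channel, no ch_ prefix

        main:
        SAMTOOLS_FAIDX(fasta)       // unconditional in this sketch

        emit:
        fai = SAMTOOLS_FAIDX.out    // callers read this as BUILD_X.out.fai
    }

Because the emit block names each output explicitly, call sites no longer depend on declaration order inside the subworkflow.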
} emit: - bwa_built = BWAMEM2_INDEX.out - dbsnp_tbi = HTSLIB_TABIX_DBSNP.out - dictBuilt = GATK_CREATE_SEQUENCE_DICTIONARY.out - fai_built = SAMTOOLS_FAIDX.out - germline_resource_tbi = HTSLIB_TABIX_GERMLINE_RESOURCE.out - intervalBuilt = BUILD_INTERVALS.out - known_indels_tbi = HTSLIB_TABIX_KNOWN_INDELS.out - pon_tbi = HTSLIB_TABIX_PON.out + bwa = BWAMEM2_INDEX + dbsnp_tbi = HTSLIB_TABIX_DBSNP + dict = GATK_CREATE_SEQUENCE_DICTIONARY + fai = SAMTOOLS_FAIDX + germline_resource_tbi = HTSLIB_TABIX_GERMLINE_RESOURCE + intervals = BUILD_INTERVALS + known_indels_tbi = HTSLIB_TABIX_KNOWN_INDELS + pon_tbi = HTSLIB_TABIX_PON } From bb39d07aeec0a24b580aae566a15461742643c64 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 16 Jul 2020 11:35:56 +0200 Subject: [PATCH 032/200] commit all current changes --- main.nf | 29 +++++++++++++++------------ modules/nf-core/multiqc.nf | 2 +- modules/nf-core/samtools_faidx.nf | 2 -- modules/subworkflows/build_indices.nf | 22 ++++++++++---------- 4 files changed, 28 insertions(+), 27 deletions(-) diff --git a/main.nf b/main.nf index 749de268ec..a55408fd12 100644 --- a/main.nf +++ b/main.nf @@ -334,19 +334,19 @@ workflow { pon, step) - bwa = params.bwa ? Channel.value(file(params.bwa)) : BUILD_INDICES.out.bwa_built - dict = params.dict ? Channel.value(file(params.dict)) : BUILD_INDICES.out.dictBuilt - fai = params.fasta_fai ? Channel.value(file(params.fasta_fai)) : BUILD_INDICES.out.fai_built - dbsnp_tbi = params.dbsnp ? params.dbsnp_index ? Channel.value(file(params.dbsnp_index)) : BUILD_INDICES.out.dbsnp_tbi : "null" - germline_resource_tbi = params.germline_resource ? params.germline_resource_index ? Channel.value(file(params.germline_resource_index)) : BUILD_INDICES.out.germline_resource_tbi : "null" - known_indels_tbi = params.known_indels ? params.known_indels_index ? Channel.value(file(params.known_indels_index)) : BUILD_INDICES.out.known_indels_tbi.collect() : "null" - pon_tbi = params.pon ? params.pon_index ? Channel.value(file(params.pon_index)) : BUILD_INDICES.out.pon_tbi : "null" - intervals = params.no_intervals ? "null" : params.intervals && !('annotate' in step) ? Channel.value(file(params.intervals)) : BUILD_INDICES.out.intervalBuilt - intervals.dump(tag: 'intervals') + bwa = params.bwa ?: BUILD_INDICES.out.bwa + dbsnp_tbi = params.dbsnp ? params.dbsnp_index ? params.dbsnp_index : BUILD_INDICES.out.dbsnp_tbi : Channel.empty() + dict = params.dict ?: BUILD_INDICES.out.dict + fai = params.fasta_fai ? params.fasta_fai : BUILD_INDICES.out.fai + germline_resource_tbi = params.germline_resource ? params.germline_resource_index ? params.germline_resource_index : BUILD_INDICES.out.germline_resource_tbi : Channel.empty() + // intervals = params.intervals ?: BUILD_INDICES.out.intervals + // intervals = params.no_intervals ? Channel.empty() : params.intervals && !('annotate' in step) ?: BUILD_INDICES.out.intervals + known_indels_tbi = params.known_indels ? params.known_indels_index ? params.known_indels_index : BUILD_INDICES.out.known_indels_tbi.collect() : Channel.empty() + pon_tbi = params.pon ? params.pon_index ? 
params.pon_index : BUILD_INDICES.out.pon_tbi : Channel.empty() // PREPROCESSING if((!params.no_intervals) && step != 'annotate') - CREATE_INTERVALS_BED(intervals) + CREATE_INTERVALS_BED(BUILD_INDICES.out.intervals) // BED INTERVAL CHANNEL TRANSFORMING ch_bed_intervals = CREATE_INTERVALS_BED.out @@ -373,10 +373,13 @@ workflow { ch_bed_intervals = Channel.from(file("${params.outdir}/no_intervals.bed")) } - //if(!('fastqc' in skipQC)) + // if(!('fastqc' in skipQC)) FASTQC(input_sample) - if(params.trim_fastq) { + input_sample.view() + bwa.view() + + if (params.trim_fastq) { TRIM_GALORE(input_sample) BWAMEM2_MEM(TRIM_GALORE.out.trimmed_reads, bwa, fasta, fai) } @@ -395,7 +398,7 @@ workflow { multiqc_config, multiqc_custom_config.ifEmpty([]), GET_SOFTWARE_VERSIONS.out.yml, - TRIM_GALORE.out.report.ifEmpty([]), + // TRIM_GALORE.out.report.ifEmpty([]), workflow_summary) } diff --git a/modules/nf-core/multiqc.nf b/modules/nf-core/multiqc.nf index a97e8daa17..526bdf62ec 100644 --- a/modules/nf-core/multiqc.nf +++ b/modules/nf-core/multiqc.nf @@ -13,7 +13,7 @@ process MULTIQC { path multiqc_config path multiqc_custom_config path software_versions - path trim_galore + // path trim_galore val workflow_summary output: diff --git a/modules/nf-core/samtools_faidx.nf b/modules/nf-core/samtools_faidx.nf index b3eb6fa86b..3242d7fac3 100644 --- a/modules/nf-core/samtools_faidx.nf +++ b/modules/nf-core/samtools_faidx.nf @@ -10,8 +10,6 @@ process SAMTOOLS_FAIDX { output: path "${fasta}.fai" - //when: !(params.fasta_fai) && params.fasta && !('annotate' in step) - script: """ samtools faidx ${fasta} diff --git a/modules/subworkflows/build_indices.nf b/modules/subworkflows/build_indices.nf index d5569e63e8..44cb4ae8f2 100644 --- a/modules/subworkflows/build_indices.nf +++ b/modules/subworkflows/build_indices.nf @@ -7,15 +7,15 @@ // And then initialize channels based on params or indexes that were just built include { BUILD_INTERVALS } from '../local/build_intervals.nf' -include { BWAMEM2_INDEX as BWAMEM2_INDEX } from '../nf-core/bwamem2_index.nf' -include { GATK_CREATE_SEQUENCE_DICTIONARY as GATK_CREATE_SEQUENCE_DICTIONARY } from '../local/gatk_dict.nf' +include { BWAMEM2_INDEX } from '../nf-core/bwamem2_index.nf' +include { GATK_CREATE_SEQUENCE_DICTIONARY } from '../local/gatk_dict.nf' include { HTSLIB_TABIX as HTSLIB_TABIX_DBSNP; HTSLIB_TABIX as HTSLIB_TABIX_GERMLINE_RESOURCE; HTSLIB_TABIX as HTSLIB_TABIX_KNOWN_INDELS; HTSLIB_TABIX as HTSLIB_TABIX_PON; } from '../nf-core/htslib_tabix' -include { SAMTOOLS_FAIDX as SAMTOOLS_FAIDX } from '../nf-core/samtools_faidx.nf' +include { SAMTOOLS_FAIDX } from '../nf-core/samtools_faidx.nf' workflow BUILD_INDICES{ take: @@ -53,12 +53,12 @@ workflow BUILD_INDICES{ BUILD_INTERVALS(SAMTOOLS_FAIDX.out) emit: - bwa = BWAMEM2_INDEX - dbsnp_tbi = HTSLIB_TABIX_DBSNP - dict = GATK_CREATE_SEQUENCE_DICTIONARY - fai = SAMTOOLS_FAIDX - germline_resource_tbi = HTSLIB_TABIX_GERMLINE_RESOURCE - intervals = BUILD_INTERVALS - known_indels_tbi = HTSLIB_TABIX_KNOWN_INDELS - pon_tbi = HTSLIB_TABIX_PON + bwa = BWAMEM2_INDEX.out + dbsnp_tbi = HTSLIB_TABIX_DBSNP.out + dict = GATK_CREATE_SEQUENCE_DICTIONARY.out + fai = SAMTOOLS_FAIDX.out + germline_resource_tbi = HTSLIB_TABIX_GERMLINE_RESOURCE.out + intervals = BUILD_INTERVALS.out + known_indels_tbi = HTSLIB_TABIX_KNOWN_INDELS.out + pon_tbi = HTSLIB_TABIX_PON.out } From ad514345bd0242cf468f9cc3b6b151c868e66232 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 16 Jul 2020 14:04:46 +0200 Subject: [PATCH 033/200] optional output --- 
modules/subworkflows/build_indices.nf | 48 ++++++++++++++++++--------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/modules/subworkflows/build_indices.nf b/modules/subworkflows/build_indices.nf index 44cb4ae8f2..f45547cbd5 100644 --- a/modules/subworkflows/build_indices.nf +++ b/modules/subworkflows/build_indices.nf @@ -29,36 +29,52 @@ workflow BUILD_INDICES{ main: if (!(params.bwa) && params.fasta && 'mapping' in step) - BWAMEM2_INDEX(fasta) + result_bwa = BWAMEM2_INDEX(fasta) + else + result_bwa = Channel.empty() if (!(params.dict) && params.fasta && !('annotate' in step) && !('controlfreec' in step)) - GATK_CREATE_SEQUENCE_DICTIONARY(fasta) + result_dict = GATK_CREATE_SEQUENCE_DICTIONARY(fasta) + else + result_dict = Channel.empty() if (!(params.fasta_fai) && params.fasta && !('annotate' in step)) - SAMTOOLS_FAIDX(fasta) + result_fai = SAMTOOLS_FAIDX(fasta) + else + result_fai = Channel.empty() if (!(params.dbsnp_index) && params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || 'tnscope' in tools)) - HTSLIB_TABIX_DBSNP(dbsnp) + result_dbsnp_tbi = HTSLIB_TABIX_DBSNP(dbsnp) + else + result_dbsnp_tbi = Channel.empty() if (!(params.germline_resource_index) && params.germline_resource && 'mutect2' in tools) - HTSLIB_TABIX_GERMLINE_RESOURCE(germline_resource) + result_germline_resource_tbi = HTSLIB_TABIX_GERMLINE_RESOURCE(germline_resource) + else + result_germline_resource_tbi = Channel.empty() if (!(params.known_indels_index) && params.known_indels && ('mapping' in step || 'preparerecalibration' in step)) - HTSLIB_TABIX_KNOWN_INDELS(known_indels) + result_known_indels_tbi = HTSLIB_TABIX_KNOWN_INDELS(known_indels) + else + result_known_indels_tbi = Channel.empty() if (!(params.pon_index) && params.pon && ('tnscope' in tools || 'mutect2' in tools)) - HTSLIB_TABIX_PON(pon) + result_pon_tbi = HTSLIB_TABIX_PON(pon) + else + result_pon_tbi = Channel.empty() if (!(params.intervals) && !('annotate' in step) && !('controlfreec' in step)) - BUILD_INTERVALS(SAMTOOLS_FAIDX.out) + result_intervals = BUILD_INTERVALS(SAMTOOLS_FAIDX.out) + else + result_intervals = Channel.empty() emit: - bwa = BWAMEM2_INDEX.out - dbsnp_tbi = HTSLIB_TABIX_DBSNP.out - dict = GATK_CREATE_SEQUENCE_DICTIONARY.out - fai = SAMTOOLS_FAIDX.out - germline_resource_tbi = HTSLIB_TABIX_GERMLINE_RESOURCE.out - intervals = BUILD_INTERVALS.out - known_indels_tbi = HTSLIB_TABIX_KNOWN_INDELS.out - pon_tbi = HTSLIB_TABIX_PON.out + bwa = result_bwa + dbsnp_tbi = result_dbsnp_tbi + dict = result_dict + fai = result_fai + germline_resource_tbi = result_germline_resource_tbi + intervals = result_intervals + known_indels_tbi = result_known_indels_tbi + pon_tbi = result_pon_tbi } From bbd462b639781be59e3b01cdde18708a3095b9f9 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 16 Jul 2020 14:29:16 +0200 Subject: [PATCH 034/200] trim_galore is optional too --- main.nf | 18 ++++++++---------- modules/nf-core/multiqc.nf | 2 +- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/main.nf b/main.nf index a55408fd12..affb07d088 100644 --- a/main.nf +++ b/main.nf @@ -335,14 +335,13 @@ workflow { step) bwa = params.bwa ?: BUILD_INDICES.out.bwa - dbsnp_tbi = params.dbsnp ? params.dbsnp_index ? params.dbsnp_index : BUILD_INDICES.out.dbsnp_tbi : Channel.empty() + dbsnp_tbi = params.dbsnp ? params.dbsnp_index ?: BUILD_INDICES.out.dbsnp_tbi : Channel.empty() dict = params.dict ?: BUILD_INDICES.out.dict fai = params.fasta_fai ? 
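The "optional output" commit above replaces bare conditional process calls with a result_* variable that is bound in every branch. The core of the pattern, as a simplified fragment using the same names as the diff:

    // every branch binds result_fai, so the emit block can always reference it
    if (!(params.fasta_fai) && params.fasta)
        result_fai = SAMTOOLS_FAIDX(fasta)
    else
        result_fai = Channel.empty()

Channel.empty() gives downstream operators something well-defined to mix() or ifEmpty() against when the index is supplied on the command line instead of being built.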
params.fasta_fai : BUILD_INDICES.out.fai - germline_resource_tbi = params.germline_resource ? params.germline_resource_index ? params.germline_resource_index : BUILD_INDICES.out.germline_resource_tbi : Channel.empty() - // intervals = params.intervals ?: BUILD_INDICES.out.intervals - // intervals = params.no_intervals ? Channel.empty() : params.intervals && !('annotate' in step) ?: BUILD_INDICES.out.intervals - known_indels_tbi = params.known_indels ? params.known_indels_index ? params.known_indels_index : BUILD_INDICES.out.known_indels_tbi.collect() : Channel.empty() - pon_tbi = params.pon ? params.pon_index ? params.pon_index : BUILD_INDICES.out.pon_tbi : Channel.empty() + germline_resource_tbi = params.germline_resource ? params.germline_resource_index ?: BUILD_INDICES.out.germline_resource_tbi : Channel.empty() + intervals = params.no_intervals ? Channel.empty() : params.intervals && !('annotate' in step) ? params.intervals : BUILD_INDICES.out.intervals + known_indels_tbi = params.known_indels ? params.known_indels_index ?: BUILD_INDICES.out.known_indels_tbi.collect() : Channel.empty() + pon_tbi = params.pon ? params.pon_index ?: BUILD_INDICES.out.pon_tbi : Channel.empty() // PREPROCESSING if((!params.no_intervals) && step != 'annotate') @@ -376,14 +375,13 @@ workflow { // if(!('fastqc' in skipQC)) FASTQC(input_sample) - input_sample.view() - bwa.view() - if (params.trim_fastq) { TRIM_GALORE(input_sample) + result_trim_galore = TRIM_GALORE.out.report BWAMEM2_MEM(TRIM_GALORE.out.trimmed_reads, bwa, fasta, fai) } else { + result_trim_galore = Channel.empty() BWAMEM2_MEM(input_sample, bwa, fasta, fai) } @@ -398,7 +396,7 @@ workflow { multiqc_config, multiqc_custom_config.ifEmpty([]), GET_SOFTWARE_VERSIONS.out.yml, - // TRIM_GALORE.out.report.ifEmpty([]), + result_trim_galore.ifEmpty([]), workflow_summary) } diff --git a/modules/nf-core/multiqc.nf b/modules/nf-core/multiqc.nf index 526bdf62ec..a97e8daa17 100644 --- a/modules/nf-core/multiqc.nf +++ b/modules/nf-core/multiqc.nf @@ -13,7 +13,7 @@ process MULTIQC { path multiqc_config path multiqc_custom_config path software_versions - // path trim_galore + path trim_galore val workflow_summary output: From d492db4b6e812c7de3360ba4058b28fa45367da3 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 16 Jul 2020 14:30:41 +0200 Subject: [PATCH 035/200] use intervals --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index affb07d088..673aef0629 100644 --- a/main.nf +++ b/main.nf @@ -345,7 +345,7 @@ workflow { // PREPROCESSING if((!params.no_intervals) && step != 'annotate') - CREATE_INTERVALS_BED(BUILD_INDICES.out.intervals) + CREATE_INTERVALS_BED(intervals) // BED INTERVAL CHANNEL TRANSFORMING ch_bed_intervals = CREATE_INTERVALS_BED.out From ef52290477bcb99518ae99835f15e411d7e92b3e Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 16 Jul 2020 15:11:46 +0200 Subject: [PATCH 036/200] better params --- main.nf | 137 +++++++++++++------------- modules/subworkflows/build_indices.nf | 1 + 2 files changed, 67 insertions(+), 71 deletions(-) diff --git a/main.nf b/main.nf index 673aef0629..123c176fea 100644 --- a/main.nf +++ b/main.nf @@ -178,55 +178,52 @@ if (tsv_path) { */ // Initialize each params in params.genomes, catch the command line first if it was defined -// params.fasta has to be the first one -params.fasta = params.genome && !('annotate' in step) ? params.genomes[params.genome].fasta ?: null : null - -// The rest can be sorted -params.ac_loci = params.genome && 'ascat' in tools ? 
params.genomes[params.genome].ac_loci ?: null : null -params.ac_loci_gc = params.genome && 'ascat' in tools ? params.genomes[params.genome].ac_loci_gc ?: null : null -params.bwa = params.genome && params.fasta && 'mapping' in step ? params.genomes[params.genome].bwa ?: null : null -params.chr_dir = params.genome && 'controlfreec' in tools ? params.genomes[params.genome].chr_dir ?: null : null -params.chr_length = params.genome && 'controlfreec' in tools ? params.genomes[params.genome].chr_length ?: null : null -params.dbsnp = params.genome && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || params.sentieon) ? params.genomes[params.genome].dbsnp ?: null : null -params.dbsnp_index = params.genome && params.dbsnp ? params.genomes[params.genome].dbsnp_index ?: null : null -params.dict = params.genome && params.fasta ? params.genomes[params.genome].dict ?: null : null -params.fasta_fai = params.genome && params.fasta ? params.genomes[params.genome].fasta_fai ?: null : null -params.germline_resource = params.genome && 'mutect2' in tools ? params.genomes[params.genome].germline_resource ?: null : null -params.germline_resource_index = params.genome && params.germline_resource ? params.genomes[params.genome].germline_resource_index ?: null : null -params.intervals = params.genome && !('annotate' in step) ? params.genomes[params.genome].intervals ?: null : null -params.known_indels = params.genome && ('mapping' in step || 'preparerecalibration' in step) ? params.genomes[params.genome].known_indels ?: null : null -params.known_indels_index = params.genome && params.known_indels ? params.genomes[params.genome].known_indels_index ?: null : null -params.mappability = params.genome && 'controlfreec' in tools ? params.genomes[params.genome].mappability ?: null : null -params.snpeff_db = params.genome && 'snpeff' in tools ? params.genomes[params.genome].snpeff_db ?: null : null -params.species = params.genome && 'vep' in tools ? params.genomes[params.genome].species ?: null : null -params.vep_cache_version = params.genome && 'vep' in tools ? params.genomes[params.genome].vep_cache_version ?: null : null + +params.ac_loci = params.genome ? params.genomes[params.genome].ac_loci ?: false : false +params.ac_loci_gc = params.genome ? params.genomes[params.genome].ac_loci_gc ?: false : false +params.bwa = params.genome ? params.genomes[params.genome].bwa ?: false : false +params.chr_dir = params.genome ? params.genomes[params.genome].chr_dir ?: false : false +params.chr_length = params.genome ? params.genomes[params.genome].chr_length ?: false : false +params.dbsnp = params.genome ? params.genomes[params.genome].dbsnp ?: false : false +params.dbsnp_index = params.genome ? params.genomes[params.genome].dbsnp_index ?: false : false +params.dict = params.genome ? params.genomes[params.genome].dict ?: false : false +params.fasta = params.genome ? params.genomes[params.genome].fasta ?: false : false +params.fasta_fai = params.genome ? params.genomes[params.genome].fasta_fai ?: false : false +params.germline_resource = params.genome ? params.genomes[params.genome].germline_resource ?: false : false +params.germline_resource_index = params.genome ? params.genomes[params.genome].germline_resource_index ?: false : false +params.intervals = params.genome ? params.genomes[params.genome].intervals ?: false : false +params.known_indels = params.genome ? 
params.genomes[params.genome].known_indels ?: false : false +params.known_indels_index = params.genome ? params.genomes[params.genome].known_indels_index ?: false : false +params.mappability = params.genome ? params.genomes[params.genome].mappability ?: false : false +params.snpeff_db = params.genome ? params.genomes[params.genome].snpeff_db ?: false : false +params.species = params.genome ? params.genomes[params.genome].species ?: false : false +params.vep_cache_version = params.genome ? params.genomes[params.genome].vep_cache_version ?: false : false // Initialize channels based on params -fasta = params.fasta && !('annotate' in step) ? Channel.value(file(params.fasta)) : "null" -dbsnp = params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || params.sentieon) ? Channel.value(file(params.dbsnp)) : "null" -germline_resource = params.germline_resource && 'mutect2' in tools ? Channel.value(file(params.germline_resource)) : "null" -known_indels = params.known_indels && ('mapping' in step || 'preparerecalibration' in step) ? Channel.value(file(params.known_indels)) : "null" -pon = params.pon ? Channel.value(file(params.pon)) : "null" - -ch_ac_loci = params.ac_loci && 'ascat' in tools ? Channel.value(file(params.ac_loci)) : "null" -ch_ac_loci_gc = params.ac_loci_gc && 'ascat' in tools ? Channel.value(file(params.ac_loci_gc)) : "null" -ch_chr_dir = params.chr_dir && 'controlfreec' in tools ? Channel.value(file(params.chr_dir)) : "null" -ch_chr_length = params.chr_length && 'controlfreec' in tools ? Channel.value(file(params.chr_length)) : "null" -fai = params.fasta_fai && !('annotate' in step) ? Channel.value(file(params.fasta_fai)) : "null" -intervals = params.intervals && !params.no_intervals && !('annotate' in step) ? Channel.value(file(params.intervals)) : "null" -ch_mappability = params.mappability && 'controlfreec' in tools ? Channel.value(file(params.mappability)) : "null" - -ch_snpeff_cache = params.snpeff_cache ? Channel.value(file(params.snpeff_cache)) : "null" -ch_snpeff_db = params.snpeff_db ? Channel.value(params.snpeff_db) : "null" -ch_vep_cache_version = params.vep_cache_version ? Channel.value(params.vep_cache_version) : "null" -ch_vep_cache = params.vep_cache ? Channel.value(file(params.vep_cache)) : "null" +fasta = params.fasta && !('annotate' in step) ? params.fasta : Channel.empty() +dbsnp = params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || params.sentieon) ? params.dbsnp : Channel.empty() +germline_resource = params.germline_resource && 'mutect2' in tools ? params.germline_resource : Channel.empty() +known_indels = params.known_indels && ('mapping' in step || 'preparerecalibration' in step) ? params.known_indels : Channel.empty() +pon = params.pon ? params.pon : Channel.empty() + +loci = params.ac_loci && 'ascat' in tools ? params.ac_loci : Channel.empty() +loci_gc = params.ac_loci_gc && 'ascat' in tools ? params.ac_loci_gc : Channel.empty() +chr_dir = params.chr_dir && 'controlfreec' in tools ? params.chr_dir : Channel.empty() +chr_length = params.chr_length && 'controlfreec' in tools ? params.chr_length : Channel.empty() +mappability = params.mappability && 'controlfreec' in tools ? 
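These one-liners lean on Groovy evaluating the Elvis operator inside the middle slot of the surrounding ternary, i.e. `a ? b ?: c : d` behaves like `a ? (b ?: c) : d`. A worked example in plain Groovy, with parentheses added for clarity and stand-in values (not pipeline code):

    def genome  = 'GRCh38'                                   // pretend --genome GRCh38
    def genomes = [GRCh38: [fasta: '/refs/GRCh38.fasta']]    // hypothetical genomes map
    def fasta   = genome ? (genomes[genome].fasta ?: false) : false
    assert fasta == '/refs/GRCh38.fasta'
    def dict    = genome ? (genomes[genome].dict ?: false) : false
    assert dict == false                                     // missing key -> false, not null

Defaulting to false rather than null keeps later `params.x ? ... : ...` checks uniform across all genome attributes.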
params.mappability : Channel.empty() + +snpeff_cache = params.snpeff_cache ?: Channel.empty() +snpeff_db = params.snpeff_db ?: Channel.empty() +snpeff_species = params.species ?: Channel.empty() +vep_cache_version = params.vep_cache_version ?: Channel.empty() +vep_cache = params.vep_cache ?: Channel.empty() // Optional files, not defined within the params.genomes[params.genome] scope -ch_cadd_indels = params.cadd_indels ? Channel.value(file(params.cadd_indels)) : "null" -ch_cadd_indels_tbi = params.cadd_indels_tbi ? Channel.value(file(params.cadd_indels_tbi)) : "null" -ch_cadd_wg_snvs = params.cadd_wg_snvs ? Channel.value(file(params.cadd_wg_snvs)) : "null" -ch_cadd_wg_snvs_tbi = params.cadd_wg_snvs_tbi ? Channel.value(file(params.cadd_wg_snvs_tbi)) : "null" -ch_target_bed = params.target_bed ? Channel.value(file(params.target_bed)) : "null" +cadd_indels = params.cadd_indels ?: Channel.empty() +cadd_indels_tbi = params.cadd_indels_tbi ?: Channel.empty() +cadd_wg_snvs = params.cadd_wg_snvs ?: Channel.empty() +cadd_wg_snvs_tbi = params.cadd_wg_snvs_tbi ?: Channel.empty() +target_bed = params.target_bed ?: Channel.empty() /* ================================================================================ @@ -322,8 +319,6 @@ include { MULTIQC } from './modules/nf-core/multiqc' params(params) include { BUILD_INDICES } from './modules/subworkflows/build_indices' addParams(params) -fasta.dump(tag: 'fasta') - workflow { BUILD_INDICES( @@ -332,15 +327,17 @@ workflow { germline_resource, known_indels, pon, - step) + step, + tools) bwa = params.bwa ?: BUILD_INDICES.out.bwa dbsnp_tbi = params.dbsnp ? params.dbsnp_index ?: BUILD_INDICES.out.dbsnp_tbi : Channel.empty() dict = params.dict ?: BUILD_INDICES.out.dict fai = params.fasta_fai ? params.fasta_fai : BUILD_INDICES.out.fai germline_resource_tbi = params.germline_resource ? params.germline_resource_index ?: BUILD_INDICES.out.germline_resource_tbi : Channel.empty() - intervals = params.no_intervals ? Channel.empty() : params.intervals && !('annotate' in step) ? params.intervals : BUILD_INDICES.out.intervals - known_indels_tbi = params.known_indels ? params.known_indels_index ?: BUILD_INDICES.out.known_indels_tbi.collect() : Channel.empty() + intervals = params.no_intervals ? Channel.empty() : params.intervals ?: BUILD_INDICES.out.intervals + known_indels_tbi = params.known_indels ? params.known_indels_index ?: BUILD_INDICES.out.known_indels_tbi : Channel.empty() + // known_indels_tbi = params.known_indels ? params.known_indels_index ?: BUILD_INDICES.out.known_indels_tbi.collect() : Channel.empty() pon_tbi = params.pon ? 
params.pon_index ?: BUILD_INDICES.out.pon_tbi : Channel.empty() // PREPROCESSING @@ -2292,7 +2289,7 @@ workflow.onComplete { // input: // set idPatient, idSample, file(bam), file(bai) from bamAscat -// file(acLoci) from ch_ac_loci +// file(acLoci) from loci // file(dict) from dict // file(fasta) from fasta // file(fastaFai) from fai @@ -2365,7 +2362,7 @@ workflow.onComplete { // input: // set idPatient, idSampleNormal, idSampleTumor, file(bafNormal), file(logrNormal), file(bafTumor), file(logrTumor) from convertAlleleCountsOut -// file(acLociGC) from ch_ac_loci_gc +// file(acLociGC) from loci_gc // output: // set val("ASCAT"), idPatient, idSampleNormal, idSampleTumor, file("${idSampleTumor}.*.{png,txt}") into ascatOut @@ -2510,9 +2507,9 @@ workflow.onComplete { // input: // set idPatient, idSampleNormal, idSampleTumor, file(mpileupNormal), file(mpileupTumor) from mpileupOut -// file(chrDir) from ch_chr_dir -// file(mappability) from ch_mappability -// file(chrLength) from ch_chr_length +// file(chrDir) from chr_dir +// file(mappability) from mappability +// file(chrLength) from chr_length // file(dbsnp) from dbsnp // file(dbsnpIndex) from dbsnp_tbi // file(fasta) from fasta @@ -2801,8 +2798,8 @@ workflow.onComplete { // input: // set variantCaller, idSample, file(vcf) from vcfSnpeff -// file(dataDir) from ch_snpeff_cache -// val snpeffDb from ch_snpeff_db +// file(dataDir) from snpeff_cache +// val snpeffDb from snpeff_db // output: // set file("${reducedVCF}_snpEff.genes.txt"), file("${reducedVCF}_snpEff.html"), file("${reducedVCF}_snpEff.csv") into snpeffReport @@ -2867,13 +2864,12 @@ workflow.onComplete { // input: // set variantCaller, idSample, file(vcf), file(idx) from vcfVep -// file(dataDir) from ch_vep_cache -// val cache_version from ch_vep_cache_version -// file(cadd_InDels) from ch_cadd_indels -// file(cadd_InDels_tbi) from ch_cadd_indels_tbi -// file(cadd_WG_SNVs) from ch_cadd_wg_snvs -// file(cadd_WG_SNVs_tbi) from ch_cadd_wg_snvs_tbi - +// file(dataDir) from vep_cache +// val cache_version from vep_cache_version +// file(cadd_InDels) from cadd_indels +// file(cadd_InDels_tbi) from cadd_indels_tbi +// file(cadd_WG_SNVs) from cadd_wg_snvs +// file(cadd_WG_SNVs_tbi) from cadd_wg_snvs_tbi // output: // set variantCaller, idSample, file("${reducedVCF}_VEP.ann.vcf") into vepVCF // file("${reducedVCF}_VEP.summary.html") into vepReport @@ -2930,13 +2926,12 @@ workflow.onComplete { // input: // set variantCaller, idSample, file(vcf), file(idx) from compressVCFsnpEffOut -// file(dataDir) from ch_vep_cache -// val cache_version from ch_vep_cache_version -// file(cadd_InDels) from ch_cadd_indels -// file(cadd_InDels_tbi) from ch_cadd_indels_tbi -// file(cadd_WG_SNVs) from ch_cadd_wg_snvs -// file(cadd_WG_SNVs_tbi) from ch_cadd_wg_snvs_tbi - +// file(dataDir) from vep_cache +// val cache_version from vep_cache_version +// file(cadd_InDels) from cadd_indels +// file(cadd_InDels_tbi) from cadd_indels_tbi +// file(cadd_WG_SNVs) from cadd_wg_snvs +// file(cadd_WG_SNVs_tbi) from cadd_wg_snvs_tbi // output: // set variantCaller, idSample, file("${reducedVCF}_VEP.ann.vcf") into vepVCFmerge // file("${reducedVCF}_VEP.summary.html") into vepReportMerge diff --git a/modules/subworkflows/build_indices.nf b/modules/subworkflows/build_indices.nf index f45547cbd5..a64e87698c 100644 --- a/modules/subworkflows/build_indices.nf +++ b/modules/subworkflows/build_indices.nf @@ -25,6 +25,7 @@ workflow BUILD_INDICES{ known_indels pon step + tools main: From a1451dd5910e01cb59e55b529915ee6c8ab8b8a7 Mon Sep 17 
00:00:00 2001 From: MaxUlysse Date: Thu, 16 Jul 2020 15:19:13 +0200 Subject: [PATCH 037/200] better params --- main.nf | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/main.nf b/main.nf index 123c176fea..50011a75c0 100644 --- a/main.nf +++ b/main.nf @@ -200,23 +200,21 @@ params.species = params.genome ? params.genomes[params.genome].s params.vep_cache_version = params.genome ? params.genomes[params.genome].vep_cache_version ?: false : false // Initialize channels based on params -fasta = params.fasta && !('annotate' in step) ? params.fasta : Channel.empty() -dbsnp = params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || params.sentieon) ? params.dbsnp : Channel.empty() -germline_resource = params.germline_resource && 'mutect2' in tools ? params.germline_resource : Channel.empty() -known_indels = params.known_indels && ('mapping' in step || 'preparerecalibration' in step) ? params.known_indels : Channel.empty() -pon = params.pon ? params.pon : Channel.empty() - -loci = params.ac_loci && 'ascat' in tools ? params.ac_loci : Channel.empty() -loci_gc = params.ac_loci_gc && 'ascat' in tools ? params.ac_loci_gc : Channel.empty() -chr_dir = params.chr_dir && 'controlfreec' in tools ? params.chr_dir : Channel.empty() -chr_length = params.chr_length && 'controlfreec' in tools ? params.chr_length : Channel.empty() -mappability = params.mappability && 'controlfreec' in tools ? params.mappability : Channel.empty() - +chr_dir = params.chr_dir ?: Channel.empty() +chr_length = params.chr_length ?: Channel.empty() +dbsnp = params.dbsnp ?: Channel.empty() +fasta = params.fasta ?: Channel.empty() +germline_resource = params.germline_resource ?: Channel.empty() +known_indels = params.known_indels ?: Channel.empty() +loci = params.ac_loci ?: Channel.empty() +loci_gc = params.ac_loci_gc ?: Channel.empty() +mappability = params.mappability ?: Channel.empty() +pon = params.pon ?: Channel.empty() snpeff_cache = params.snpeff_cache ?: Channel.empty() snpeff_db = params.snpeff_db ?: Channel.empty() snpeff_species = params.species ?: Channel.empty() -vep_cache_version = params.vep_cache_version ?: Channel.empty() vep_cache = params.vep_cache ?: Channel.empty() +vep_cache_version = params.vep_cache_version ?: Channel.empty() // Optional files, not defined within the params.genomes[params.genome] scope cadd_indels = params.cadd_indels ?: Channel.empty() From a992f166bade964d4380ff4cf4cbff4cec15520a Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Thu, 16 Jul 2020 15:30:42 +0200 Subject: [PATCH 038/200] Add merge_mapped_bam module --- modules/local/merge_mapped_bam.nf | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 modules/local/merge_mapped_bam.nf diff --git a/modules/local/merge_mapped_bam.nf b/modules/local/merge_mapped_bam.nf new file mode 100644 index 0000000000..6cae594c10 --- /dev/null +++ b/modules/local/merge_mapped_bam.nf @@ -0,0 +1,16 @@ +process MergeBamMapped { + label 'cpus' + + tag "${idPatient}-${idSample}" + + input: + tuple idPatient, idSample, idRun, path(bam) // from multiple + + output: + tuple idPatient, idSample, path("${idSample}.bam") //into bam_mapped_mer + + script: + """ + samtools merge --threads ${task.cpus} ${idSample}.bam ${bam} + """ +} \ No newline at end of file From 059079e7f5ae04f76ac11d14b2cf7ea44c98fcf7 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 16 Jul 2020 15:44:53 +0200 Subject: [PATCH 039/200] 
CREATE_INTERVALS_BED is now in build_indices subworkflow --- main.nf | 43 ++++++++++++--------------- modules/subworkflows/build_indices.nf | 16 ++++++---- 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/main.nf b/main.nf index 50011a75c0..75240b1fa6 100644 --- a/main.nf +++ b/main.nf @@ -86,9 +86,9 @@ tools = params.tools ? params.tools.split(',').collect{it.trim().toLowerCase().r if (step == 'controlfreec') tools = ['controlfreec'] if (!check_parameter_list(tools, tool_list)) exit 1, 'Unknown tool(s), see --help for more information' -// skip__qc_list = define_skip_qc_list() -// skipQC = params.skip_qc ? params.skip_qc == 'all' ? skip__qc_list : params.skip_qc.split(',').collect{it.trim().toLowerCase().replaceAll('-', '').replaceAll('_', '')} : [] -// if (!check_parameter_list(skipQC, skip__qc_list)) exit 1, 'Unknown QC tool(s), see --help for more information' +// skip_qc_list = define_skip_qc_list() +// skip_qc = params.skip_qc ? params.skip_qc == 'all' ? skip_qc_list : params.skip_qc.split(',').collect{it.trim().toLowerCase().replaceAll('-', '').replaceAll('_', '')} : [] +// if (!check_parameter_list(skip_qc, skip_qc_list)) exit 1, 'Unknown QC tool(s), see --help for more information' // anno_list = define_anno_list() // annotate_tools = params.annotate_tools ? params.annotate_tools.split(',').collect{it.trim().toLowerCase().replaceAll('-', '')} : [] @@ -98,7 +98,6 @@ if (!check_parameter_list(tools, tool_list)) exit 1, 'Unknown tool(s), see --hel // if ((params.ascat_ploidy && !params.ascat_purity) || (!params.ascat_ploidy && params.ascat_purity)) exit 1, 'Please specify both --ascat_purity and --ascat_ploidy, or none of them' // if (params.cf_window && params.cf_coeff) exit 1, 'Please specify either --cf_window OR --cf_coeff, but not both of them' - // Handle input tsv_path = null if (params.input && (has_extension(params.input, "tsv") || has_extension(params.input, "vcf") || has_extension(params.input, "vcf.gz"))) tsv_path = params.input @@ -254,7 +253,6 @@ if (params.sentieon) log.warn "[nf-core/sarek] Sentieon will be used, only works */ include { BWAMEM2_MEM } from './modules/local/bwamem2_mem.nf' addParams(params) -include { CREATE_INTERVALS_BED } from './modules/local/create_intervals_bed' addParams(params) include { GET_SOFTWARE_VERSIONS } from './modules/local/get_software_versions' params(params) include { OUTPUT_DOCUMENTATION } from './modules/local/output_documentation' params(params) include { TRIM_GALORE } from './modules/local/trim_galore.nf' addParams(params) @@ -333,18 +331,15 @@ workflow { dict = params.dict ?: BUILD_INDICES.out.dict fai = params.fasta_fai ? params.fasta_fai : BUILD_INDICES.out.fai germline_resource_tbi = params.germline_resource ? params.germline_resource_index ?: BUILD_INDICES.out.germline_resource_tbi : Channel.empty() - intervals = params.no_intervals ? Channel.empty() : params.intervals ?: BUILD_INDICES.out.intervals + intervals_bed = params.no_intervals ? Channel.empty() : BUILD_INDICES.out.intervals_bed known_indels_tbi = params.known_indels ? params.known_indels_index ?: BUILD_INDICES.out.known_indels_tbi : Channel.empty() // known_indels_tbi = params.known_indels ? params.known_indels_index ?: BUILD_INDICES.out.known_indels_tbi.collect() : Channel.empty() pon_tbi = params.pon ? 
params.pon_index ?: BUILD_INDICES.out.pon_tbi : Channel.empty() // PREPROCESSING - if((!params.no_intervals) && step != 'annotate') - CREATE_INTERVALS_BED(intervals) // BED INTERVAL CHANNEL TRANSFORMING - ch_bed_intervals = CREATE_INTERVALS_BED.out - .flatten() + intervals_bed.flatten() .map { intervalFile -> def duration = 0.0 for (line in intervalFile.readLines()) { @@ -360,14 +355,14 @@ workflow { }.toSortedList({ a, b -> b[0] <=> a[0] }) .flatten().collate(2) .map{duration, intervalFile -> intervalFile} - ch_bed_intervals.dump(tag:'bedintervals') + intervals_bed.dump(tag:'bedintervals') if (params.no_intervals && step != 'annotate') { file("${params.outdir}/no_intervals.bed").text = "no_intervals\n" - ch_bed_intervals = Channel.from(file("${params.outdir}/no_intervals.bed")) + intervals_bed = Channel.from(file("${params.outdir}/no_intervals.bed")) } - // if(!('fastqc' in skipQC)) + // if(!('fastqc' in skip_qc)) FASTQC(input_sample) if (params.trim_fastq) { @@ -436,7 +431,7 @@ workflow.onComplete { // output: // file("*.{html,zip}") into fastQCBAMReport -// when: !('fastqc' in skipQC) +// when: !('fastqc' in skip_qc) // script: // """ @@ -649,7 +644,7 @@ workflow.onComplete { // script: // markdup_java_options = task.memory.toGiga() > 8 ? params.markdup_java_options : "\"-Xms" + (task.memory.toGiga() / 2).trunc() + "g -Xmx" + (task.memory.toGiga() - 1) + "g\"" -// metrics = 'markduplicates' in skipQC ? '' : "-M ${idSample}.bam.metrics" +// metrics = 'markduplicates' in skip_qc ? '' : "-M ${idSample}.bam.metrics" // if (params.no_gatk_spark) // """ // gatk --java-options ${markdup_java_options} \ @@ -699,7 +694,7 @@ workflow.onComplete { // ["duplicates_marked_no_table_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] // } -// if ('markduplicates' in skipQC) duplicates_marked_report.close() +// if ('markduplicates' in skip_qc) duplicates_marked_report.close() // if (step == 'preparerecalibration') bam_duplicates_marked = input_sample @@ -725,7 +720,7 @@ workflow.onComplete { // publishDir params.outdir, mode: params.publish_dir_mode, // saveAs: { -// if (it == "${idSample}_*.txt" && 'sentieon' in skipQC) null +// if (it == "${idSample}_*.txt" && 'sentieon' in skip_qc) null // else if (it == "${idSample}_*.txt") "Reports/${idSample}/Sentieon/${it}" // else "Preprocessing/${idSample}/DedupedSentieon/${it}" // } @@ -854,7 +849,7 @@ workflow.onComplete { // """ // } -// if ('baserecalibrator' in skipQC) baseRecalibratorReport.close() +// if ('baserecalibrator' in skip_qc) baseRecalibratorReport.close() // recalTable = recalTable.dump(tag:'RECAL TABLE') @@ -964,7 +959,7 @@ workflow.onComplete { // publishDir params.outdir, mode: params.publish_dir_mode, // saveAs: { -// if (it == "${idSample}_recal_result.csv" && 'sentieon' in skipQC) "Reports/${idSample}/Sentieon/${it}" +// if (it == "${idSample}_recal_result.csv" && 'sentieon' in skip_qc) "Reports/${idSample}/Sentieon/${it}" // else "Preprocessing/${idSample}/RecalSentieon/${it}" // } @@ -1154,7 +1149,7 @@ workflow.onComplete { // output: // file ("${bam}.samtools.stats.out") into samtoolsStatsReport -// when: !('samtools' in skipQC) +// when: !('samtools' in skip_qc) // script: // """ @@ -1181,7 +1176,7 @@ workflow.onComplete { // output: // file("${bam.baseName}") into bamQCReport -// when: !('bamqc' in skipQC) +// when: !('bamqc' in skip_qc) // script: // use_bed = params.target_bed ? 
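The interval-channel transformation visible above estimates a runtime per BED interval and sorts the files longest-first, so the biggest jobs are scheduled early. A worked example of the per-interval estimate, assuming params.nucleotides_per_second = 1000 (it is a pipeline parameter; the BED line is made up):

    def nucleotides_per_second = 1000
    def fields = "chr1\t0\t2000000".split('\t')     // chrom, start, end
    def duration = (fields[2].toInteger() - fields[1].toInteger()) / nucleotides_per_second
    assert duration == 2000                          // seconds; used only for ordering
    // five-column BED lines skip the estimate and take fields[4].toFloat() directly

Sorting the (duration, file) pairs descending with toSortedList({ a, b -> b[0] <=> a[0] }) then evens out wall-clock time across the parallel per-interval jobs.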
"-gff ${targetBED}" : '' @@ -2681,7 +2676,7 @@ workflow.onComplete { // output: // file ("*.bcf.tools.stats.out") into bcftoolsReport -// when: !('bcftools' in skipQC) +// when: !('bcftools' in skip_qc) // script: // """ @@ -2704,7 +2699,7 @@ workflow.onComplete { // output: // file ("${reduceVCF(vcf.fileName)}.*") into vcftoolsReport -// when: !('vcftools' in skipQC) +// when: !('vcftools' in skip_qc) // script: // """ @@ -3025,7 +3020,7 @@ workflow.onComplete { // file "*_data" // file "multiqc_plots" -// when: !('multiqc' in skipQC) +// when: !('multiqc' in skip_qc) // script: // rtitle = custom_runName ? "--title \"$custom_runName\"" : '' diff --git a/modules/subworkflows/build_indices.nf b/modules/subworkflows/build_indices.nf index a64e87698c..731ce3bfbb 100644 --- a/modules/subworkflows/build_indices.nf +++ b/modules/subworkflows/build_indices.nf @@ -6,8 +6,9 @@ // And then initialize channels based on params or indexes that were just built -include { BUILD_INTERVALS } from '../local/build_intervals.nf' -include { BWAMEM2_INDEX } from '../nf-core/bwamem2_index.nf' +include { BUILD_INTERVALS } from '../local/build_intervals.nf' +include { BWAMEM2_INDEX } from '../nf-core/bwamem2_index.nf' +include { CREATE_INTERVALS_BED } from '../local/create_intervals_bed.nf' include { GATK_CREATE_SEQUENCE_DICTIONARY } from '../local/gatk_dict.nf' include { HTSLIB_TABIX as HTSLIB_TABIX_DBSNP; @@ -15,7 +16,7 @@ include { HTSLIB_TABIX as HTSLIB_TABIX_KNOWN_INDELS; HTSLIB_TABIX as HTSLIB_TABIX_PON; } from '../nf-core/htslib_tabix' -include { SAMTOOLS_FAIDX } from '../nf-core/samtools_faidx.nf' +include { SAMTOOLS_FAIDX } from '../nf-core/samtools_faidx.nf' workflow BUILD_INDICES{ take: @@ -64,8 +65,11 @@ workflow BUILD_INDICES{ else result_pon_tbi = Channel.empty() - if (!(params.intervals) && !('annotate' in step) && !('controlfreec' in step)) - result_intervals = BUILD_INTERVALS(SAMTOOLS_FAIDX.out) + if (!('annotate' in step) && !('controlfreec' in step)) + if (!params.intervals) + result_intervals = CREATE_INTERVALS_BED(BUILD_INTERVALS(SAMTOOLS_FAIDX.out)) + else + result_intervals = CREATE_INTERVALS_BED(params.intervals) else result_intervals = Channel.empty() @@ -75,7 +79,7 @@ workflow BUILD_INDICES{ dict = result_dict fai = result_fai germline_resource_tbi = result_germline_resource_tbi - intervals = result_intervals + intervals_bed = result_intervals known_indels_tbi = result_known_indels_tbi pon_tbi = result_pon_tbi } From 87df112641f1578b4b2e8c9184fee3fce61fb74d Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 16 Jul 2020 15:57:44 +0200 Subject: [PATCH 040/200] update Nextflow version to 20.06.0-edge --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bb70fe3771..353d9e3194 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ > **An open-source analysis pipeline to detect germline or somatic variants from whole genome or targeted sequencing** -[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A520.04.1-brightgreen.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A520.06.0--edge-brightgreen.svg)](https://www.nextflow.io/) [![nf-core](https://img.shields.io/badge/nf--core-pipeline-brightgreen.svg)](https://nf-co.re/) [![DOI](https://zenodo.org/badge/184289291.svg)](https://zenodo.org/badge/latestdoi/184289291) From 5d53898af76073861b019a59c67e4ed1d1b8660a Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 16 Jul 2020 16:05:40 +0200 Subject: [PATCH 041/200] skip_qc is back + collect() 
too as it was actually useful --- main.nf | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/main.nf b/main.nf index 75240b1fa6..0c704af8fb 100644 --- a/main.nf +++ b/main.nf @@ -38,6 +38,7 @@ if (params.help) { include { check_parameter_existence; check_parameter_list; + define_skip_qc_list; define_step_list; define_tool_list; extract_bam; @@ -86,9 +87,9 @@ tools = params.tools ? params.tools.split(',').collect{it.trim().toLowerCase().r if (step == 'controlfreec') tools = ['controlfreec'] if (!check_parameter_list(tools, tool_list)) exit 1, 'Unknown tool(s), see --help for more information' -// skip_qc_list = define_skip_qc_list() -// skip_qc = params.skip_qc ? params.skip_qc == 'all' ? skip_qc_list : params.skip_qc.split(',').collect{it.trim().toLowerCase().replaceAll('-', '').replaceAll('_', '')} : [] -// if (!check_parameter_list(skip_qc, skip_qc_list)) exit 1, 'Unknown QC tool(s), see --help for more information' +skip_qc_list = define_skip_qc_list() +skip_qc = params.skip_qc ? params.skip_qc == 'all' ? skip_qc_list : params.skip_qc.split(',').collect{it.trim().toLowerCase().replaceAll('-', '').replaceAll('_', '')} : [] +if (!check_parameter_list(skip_qc, skip_qc_list)) exit 1, 'Unknown QC tool(s), see --help for more information' // anno_list = define_anno_list() // annotate_tools = params.annotate_tools ? params.annotate_tools.split(',').collect{it.trim().toLowerCase().replaceAll('-', '')} : [] @@ -362,8 +363,10 @@ workflow { intervals_bed = Channel.from(file("${params.outdir}/no_intervals.bed")) } - // if(!('fastqc' in skip_qc)) - FASTQC(input_sample) + if(!('fastqc' in skip_qc)) + result_fastqc = FASTQC(input_sample) + else + result_fastqc = Channel.empty() if (params.trim_fastq) { TRIM_GALORE(input_sample) @@ -382,11 +385,11 @@ workflow { GET_SOFTWARE_VERSIONS() MULTIQC( - FASTQC.out.ifEmpty([]), + result_fastqc.collect().ifEmpty([]), multiqc_config, multiqc_custom_config.ifEmpty([]), GET_SOFTWARE_VERSIONS.out.yml, - result_trim_galore.ifEmpty([]), + result_trim_galore.collect().ifEmpty([]), workflow_summary) } From 1756f8105985e39a249293c4bca04a73ec95d2d2 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 16 Jul 2020 16:22:32 +0200 Subject: [PATCH 042/200] fix bwa-mem2 version --- bin/scrape_software_versions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py index 2d878f7f72..5f42871207 100755 --- a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -7,7 +7,7 @@ 'AlleleCount': ['v_allelecount.txt', r"(\S+)"], 'ASCAT': ['v_ascat.txt', r"Version: (\S+)"], 'bcftools': ['v_bcftools.txt', r"bcftools (\S+)"], - 'BWAMEM2': ['v_bwamem2.txt', r"Version: (\S+)"], + 'BWA-MEM2': ['v_bwamem2.txt', r"(\S+)"], 'CNVkit': ['v_cnvkit.txt', r"(\S+)"], 'Control-FREEC': ['v_controlfreec.txt', r"Control-FREEC\s(\S+)"], 'FastQC': ['v_fastqc.txt', r"FastQC v(\S+)"], From be8b3d2e7321b86b9fccfd49e02ecec3ac97d166 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 16 Jul 2020 16:22:59 +0200 Subject: [PATCH 043/200] remove unused params(params) and addParams(params) --- main.nf | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/main.nf b/main.nf index 0c704af8fb..95ee1168f4 100644 --- a/main.nf +++ b/main.nf @@ -253,10 +253,18 @@ if (params.sentieon) log.warn "[nf-core/sarek] Sentieon will be used, only works ================================================================================ */ -include { BWAMEM2_MEM } from 
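The restored skip_qc handling above normalises whatever the user typed before validating it against the known QC-tool list. A worked example in plain Groovy, with an illustrative subset standing in for define_skip_qc_list():

    def skip_qc_list = ['bamqc', 'fastqc', 'markduplicates', 'multiqc']   // illustrative subset
    def requested = 'FastQC, Mark-Duplicates'
    def skip_qc = requested == 'all' ? skip_qc_list
        : requested.split(',').collect { it.trim().toLowerCase().replaceAll('-', '').replaceAll('_', '') }
    assert skip_qc == ['fastqc', 'markduplicates']

Stripping case, dashes and underscores means 'Mark-Duplicates', 'markduplicates' and 'MARK_DUPLICATES' all resolve to the same list entry.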
'./modules/local/bwamem2_mem.nf' addParams(params) -include { GET_SOFTWARE_VERSIONS } from './modules/local/get_software_versions' params(params) -include { OUTPUT_DOCUMENTATION } from './modules/local/output_documentation' params(params) -include { TRIM_GALORE } from './modules/local/trim_galore.nf' addParams(params) +include { BWAMEM2_MEM } from './modules/local/bwamem2_mem.nf' +include { GET_SOFTWARE_VERSIONS } from './modules/local/get_software_versions' +include { OUTPUT_DOCUMENTATION } from './modules/local/output_documentation' +include { TRIM_GALORE } from './modules/local/trim_galore.nf' + +/* +================================================================================ + INCLUDE LOCAL PIPELINE SUBWORKFLOWS +================================================================================ +*/ + +include { BUILD_INDICES } from './modules/subworkflows/build_indices' /* ================================================================================ @@ -264,8 +272,8 @@ include { TRIM_GALORE } from './modules/local/trim_galore.nf' addParam ================================================================================ */ -include { FASTQC } from './modules/nf-core/fastqc' params(params) -include { MULTIQC } from './modules/nf-core/multiqc' params(params) +include { FASTQC } from './modules/nf-core/fastqc' +include { MULTIQC } from './modules/nf-core/multiqc' // PREPARING CHANNELS FOR PREPROCESSING AND QC @@ -314,8 +322,6 @@ include { MULTIQC } from './modules/nf-core/multiqc' params(params) ================================================================================ */ -include { BUILD_INDICES } from './modules/subworkflows/build_indices' addParams(params) - workflow { BUILD_INDICES( From f683c50d5d575d0b59687716313f83a06069f90d Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 16 Jul 2020 16:24:46 +0200 Subject: [PATCH 044/200] center headers --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 95ee1168f4..277cba5e4c 100644 --- a/main.nf +++ b/main.nf @@ -249,7 +249,7 @@ if (params.sentieon) log.warn "[nf-core/sarek] Sentieon will be used, only works /* ================================================================================ - INCLUDE LOCAL PIPELINE MODULES + INCLUDE LOCAL PIPELINE MODULES ================================================================================ */ @@ -260,7 +260,7 @@ include { TRIM_GALORE } from './modules/local/trim_galore.nf' /* ================================================================================ - INCLUDE LOCAL PIPELINE SUBWORKFLOWS + INCLUDE LOCAL PIPELINE SUBWORKFLOWS ================================================================================ */ From 5d8fc2b55c0bf36e4ce757816b2a28226b7d5de7 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 16 Jul 2020 17:07:43 +0200 Subject: [PATCH 045/200] all intervals stuff in out of main.nf --- main.nf | 28 ++------------------------- modules/subworkflows/build_indices.nf | 26 ++++++++++++++++++++++--- 2 files changed, 25 insertions(+), 29 deletions(-) diff --git a/main.nf b/main.nf index 277cba5e4c..116f6d69bb 100644 --- a/main.nf +++ b/main.nf @@ -338,37 +338,13 @@ workflow { dict = params.dict ?: BUILD_INDICES.out.dict fai = params.fasta_fai ? params.fasta_fai : BUILD_INDICES.out.fai germline_resource_tbi = params.germline_resource ? params.germline_resource_index ?: BUILD_INDICES.out.germline_resource_tbi : Channel.empty() - intervals_bed = params.no_intervals ? 
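To close the section: the mapping commits above converge on a groupTuple/branch/mix pattern for samples sequenced over one or several runs. A condensed sketch of that flow, assuming BWAMEM2_MEM.out emits (patient, sample, run, bam, bai) tuples as in the later merge commits:

    BWAMEM2_MEM.out.groupTuple(by: [0, 1])          // group all runs of a sample
        .branch {
            single:   it[2].size() == 1             // one run: pass through as-is
            multiple: it[2].size() > 1              // several runs: merge first
        }
        .set { bam }

    bam_single = bam.single.map { patient, sample, run, bam, bai ->
        [patient, sample, bam[0], bai[0]]           // unwrap the single-element lists
    }
    bam_mapped = bam_single.mix(MERGE_BAM_MAPPED(bam.multiple))

MERGE_BAM_MAPPED (samtools merge plus samtools index, per the updated module) returns the same (patient, sample, bam, bai) shape, so both branches can be mixed into one downstream channel.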
Channel.empty() : BUILD_INDICES.out.intervals_bed - known_indels_tbi = params.known_indels ? params.known_indels_index ?: BUILD_INDICES.out.known_indels_tbi : Channel.empty() - // known_indels_tbi = params.known_indels ? params.known_indels_index ?: BUILD_INDICES.out.known_indels_tbi.collect() : Channel.empty() + intervals_bed = BUILD_INDICES.out.intervals_bed + known_indels_tbi = params.known_indels ? params.known_indels_index ?: BUILD_INDICES.out.known_indels_tbi.collect() : Channel.empty() pon_tbi = params.pon ? params.pon_index ?: BUILD_INDICES.out.pon_tbi : Channel.empty() // PREPROCESSING - - // BED INTERVAL CHANNEL TRANSFORMING - intervals_bed.flatten() - .map { intervalFile -> - def duration = 0.0 - for (line in intervalFile.readLines()) { - final fields = line.split('\t') - if (fields.size() >= 5) duration += fields[4].toFloat() - else { - start = fields[1].toInteger() - end = fields[2].toInteger() - duration += (end - start) / params.nucleotides_per_second - } - } - [ duration, intervalFile] - }.toSortedList({ a, b -> b[0] <=> a[0] }) - .flatten().collate(2) - .map{duration, intervalFile -> intervalFile} intervals_bed.dump(tag:'bedintervals') - if (params.no_intervals && step != 'annotate') { - file("${params.outdir}/no_intervals.bed").text = "no_intervals\n" - intervals_bed = Channel.from(file("${params.outdir}/no_intervals.bed")) - } - if(!('fastqc' in skip_qc)) result_fastqc = FASTQC(input_sample) else diff --git a/modules/subworkflows/build_indices.nf b/modules/subworkflows/build_indices.nf index 731ce3bfbb..bc0fa1ba51 100644 --- a/modules/subworkflows/build_indices.nf +++ b/modules/subworkflows/build_indices.nf @@ -65,13 +65,33 @@ workflow BUILD_INDICES{ else result_pon_tbi = Channel.empty() - if (!('annotate' in step) && !('controlfreec' in step)) + if (params.no_intervals) { + file("${params.outdir}/no_intervals.bed").text = "no_intervals\n" + result_intervals = Channel.from(file("${params.outdir}/no_intervals.bed")) + } else if (!('annotate' in step) && !('controlfreec' in step)) if (!params.intervals) result_intervals = CREATE_INTERVALS_BED(BUILD_INTERVALS(SAMTOOLS_FAIDX.out)) else result_intervals = CREATE_INTERVALS_BED(params.intervals) - else - result_intervals = Channel.empty() + + if (!params.no_intervals) { + result_intervals.flatten() + .map { intervalFile -> + def duration = 0.0 + for (line in intervalFile.readLines()) { + final fields = line.split('\t') + if (fields.size() >= 5) duration += fields[4].toFloat() + else { + start = fields[1].toInteger() + end = fields[2].toInteger() + duration += (end - start) / params.nucleotides_per_second + } + } + [duration, intervalFile] + }.toSortedList({ a, b -> b[0] <=> a[0] }) + .flatten().collate(2) + .map{duration, intervalFile -> intervalFile} + } emit: bwa = result_bwa From b8df2ddf63796da91ccc3a55c8cce28134441d99 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 16 Jul 2020 18:58:15 +0200 Subject: [PATCH 046/200] restore multiple channels for bedintervals --- modules/subworkflows/build_indices.nf | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/modules/subworkflows/build_indices.nf b/modules/subworkflows/build_indices.nf index bc0fa1ba51..20ac7cf005 100644 --- a/modules/subworkflows/build_indices.nf +++ b/modules/subworkflows/build_indices.nf @@ -70,12 +70,12 @@ workflow BUILD_INDICES{ result_intervals = Channel.from(file("${params.outdir}/no_intervals.bed")) } else if (!('annotate' in step) && !('controlfreec' in step)) if (!params.intervals) - result_intervals = 
CREATE_INTERVALS_BED(BUILD_INTERVALS(SAMTOOLS_FAIDX.out)) + intervals = CREATE_INTERVALS_BED(BUILD_INTERVALS(SAMTOOLS_FAIDX.out)) else - result_intervals = CREATE_INTERVALS_BED(params.intervals) + intervals = CREATE_INTERVALS_BED(params.intervals) if (!params.no_intervals) { - result_intervals.flatten() + intervals.flatten() .map { intervalFile -> def duration = 0.0 for (line in intervalFile.readLines()) { @@ -91,6 +91,11 @@ workflow BUILD_INDICES{ }.toSortedList({ a, b -> b[0] <=> a[0] }) .flatten().collate(2) .map{duration, intervalFile -> intervalFile} + .multiMap{ + all: it + empty: "" + }.set{bed} + result_intervals = bed.all } emit: From 1e15297771664e5829ace89ee7cdcb52ab73ab26 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 16 Jul 2020 19:04:01 +0200 Subject: [PATCH 047/200] code polishing --- modules/subworkflows/build_indices.nf | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/modules/subworkflows/build_indices.nf b/modules/subworkflows/build_indices.nf index 20ac7cf005..588571a4e3 100644 --- a/modules/subworkflows/build_indices.nf +++ b/modules/subworkflows/build_indices.nf @@ -70,12 +70,12 @@ workflow BUILD_INDICES{ result_intervals = Channel.from(file("${params.outdir}/no_intervals.bed")) } else if (!('annotate' in step) && !('controlfreec' in step)) if (!params.intervals) - intervals = CREATE_INTERVALS_BED(BUILD_INTERVALS(SAMTOOLS_FAIDX.out)) + result_intervals = CREATE_INTERVALS_BED(BUILD_INTERVALS(SAMTOOLS_FAIDX.out)) else - intervals = CREATE_INTERVALS_BED(params.intervals) + result_intervals = CREATE_INTERVALS_BED(params.intervals) if (!params.no_intervals) { - intervals.flatten() + result_intervals = result_intervals.flatten() .map { intervalFile -> def duration = 0.0 for (line in intervalFile.readLines()) { @@ -91,11 +91,7 @@ workflow BUILD_INDICES{ }.toSortedList({ a, b -> b[0] <=> a[0] }) .flatten().collate(2) .map{duration, intervalFile -> intervalFile} - .multiMap{ - all: it - empty: "" - }.set{bed} - result_intervals = bed.all + .flatten() } emit: From cfb2ffdfcf6886d5ccb3a3730aa35422bc31c5d7 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 16 Jul 2020 19:04:55 +0200 Subject: [PATCH 048/200] further polishing --- modules/subworkflows/build_indices.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/subworkflows/build_indices.nf b/modules/subworkflows/build_indices.nf index 588571a4e3..f7868fb104 100644 --- a/modules/subworkflows/build_indices.nf +++ b/modules/subworkflows/build_indices.nf @@ -91,7 +91,6 @@ workflow BUILD_INDICES{ }.toSortedList({ a, b -> b[0] <=> a[0] }) .flatten().collate(2) .map{duration, intervalFile -> intervalFile} - .flatten() } emit: From 0c4d62aeef8c5eff494b6610fd58c69c7e7a1bd2 Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Fri, 17 Jul 2020 09:41:24 +0200 Subject: [PATCH 049/200] Add merge_mapped to wf --- main.nf | 18 +++++++++++++++++- modules/local/merge_mapped_bam.nf | 6 +++--- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/main.nf b/main.nf index 673aef0629..bbc31f09be 100644 --- a/main.nf +++ b/main.nf @@ -263,7 +263,7 @@ include { CREATE_INTERVALS_BED } from './modules/local/create_intervals_bed' ad include { GET_SOFTWARE_VERSIONS } from './modules/local/get_software_versions' params(params) include { OUTPUT_DOCUMENTATION } from './modules/local/output_documentation' params(params) include { TRIM_GALORE } from './modules/local/trim_galore.nf' addParams(params) - +include { MERGE_BAM_MAPPED } from './modules/local/merge_mapped_bam' addParams(params) /* 
================================================================================ INCLUDE nf-core PIPELINE MODULES @@ -385,6 +385,22 @@ workflow { BWAMEM2_MEM(input_sample, bwa, fasta, fai) } + BWAMEM2_MEM.out.groupTuple(by:[0, 1]) + .branch { + single: it[2].size() == 1 + multiple: it[2].size() > 1 + }.set { bam } + bam.single.map { + idPatient, idSample, idRun, bam -> + [idPatient, idSample, bam] + } + + bam.single.view() + bam.multiple.view() + + //multipleBam = multipleBam.mix(multipleBamSentieon) + MERGE_BAM_MAPPED(bam.multiple) + OUTPUT_DOCUMENTATION( output_docs, output_docs_images) diff --git a/modules/local/merge_mapped_bam.nf b/modules/local/merge_mapped_bam.nf index 6cae594c10..9bbf8fceaf 100644 --- a/modules/local/merge_mapped_bam.nf +++ b/modules/local/merge_mapped_bam.nf @@ -1,13 +1,13 @@ -process MergeBamMapped { +process MERGE_BAM_MAPPED { label 'cpus' tag "${idPatient}-${idSample}" input: - tuple idPatient, idSample, idRun, path(bam) // from multiple + tuple idPatient, idSample, idRun, path(bam), path(bai)// from multiple output: - tuple idPatient, idSample, path("${idSample}.bam") //into bam_mapped_mer + tuple idPatient, idSample, path("${idSample}.bam") //into bam_mapped_merged script: """ From e4c3ca8d20df7d17d020c9e62724351aab0f1862 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Fri, 17 Jul 2020 11:37:10 +0200 Subject: [PATCH 050/200] bam_mapped have single and multiple bams --- main.nf | 15 ++++++++------- modules/local/merge_mapped_bam.nf | 9 +++++---- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/main.nf b/main.nf index da9009b517..cd67ccf88f 100644 --- a/main.nf +++ b/main.nf @@ -367,16 +367,17 @@ workflow { single: it[2].size() == 1 multiple: it[2].size() > 1 }.set { bam } - bam.single.map { - idPatient, idSample, idRun, bam -> - [idPatient, idSample, bam] - } - bam.single.view() - bam.multiple.view() + bam_single = bam.single.map { + idPatient, idSample, idRun, bam, bai -> + [idPatient, idSample, bam[0], bai[0]] + } //multipleBam = multipleBam.mix(multipleBamSentieon) - MERGE_BAM_MAPPED(bam.multiple) + + bam_mapped = bam_single.mix(MERGE_BAM_MAPPED(bam.multiple)) + + bam_mapped.view() OUTPUT_DOCUMENTATION( output_docs, diff --git a/modules/local/merge_mapped_bam.nf b/modules/local/merge_mapped_bam.nf index 9bbf8fceaf..8e70342bbd 100644 --- a/modules/local/merge_mapped_bam.nf +++ b/modules/local/merge_mapped_bam.nf @@ -1,16 +1,17 @@ process MERGE_BAM_MAPPED { label 'cpus' - tag "${idPatient}-${idSample}" + tag "${patient}-${sample}" input: - tuple idPatient, idSample, idRun, path(bam), path(bai)// from multiple + tuple patient, sample, run, path(bam), path(bai) output: - tuple idPatient, idSample, path("${idSample}.bam") //into bam_mapped_merged + tuple patient, sample, path("${sample}.bam"), path("${sample}.bam.bai") script: """ - samtools merge --threads ${task.cpus} ${idSample}.bam ${bam} + samtools merge --threads ${task.cpus} ${sample}.bam ${bam} + samtools index ${sample}.bam """ } \ No newline at end of file From 2385f17b998d4d1a24285beafaa1f63ffbd19ead Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Fri, 17 Jul 2020 16:06:14 +0200 Subject: [PATCH 051/200] Add MarkDuplicates --- main.nf | 35 +++--------------------- modules/local/mark_duplicates.nf | 46 ++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 31 deletions(-) create mode 100644 modules/local/mark_duplicates.nf diff --git a/main.nf b/main.nf index cd67ccf88f..0e70053187 100644 --- a/main.nf +++ b/main.nf @@ -259,6 +259,7 @@ include { GET_SOFTWARE_VERSIONS } 
from './modules/local/get_software_versions' include { OUTPUT_DOCUMENTATION } from './modules/local/output_documentation' include { TRIM_GALORE } from './modules/local/trim_galore.nf' include { MERGE_BAM_MAPPED } from './modules/local/merge_mapped_bam' addParams(params) +include { MARK_DUPLICATES } from './modules/local/mark_duplicates' params(params) /* ================================================================================ @@ -379,6 +380,9 @@ workflow { bam_mapped.view() + if(!(params.skip_markduplicates)) + MARK_DUPLICATES(bam_mapped) + OUTPUT_DOCUMENTATION( output_docs, output_docs_images) @@ -458,18 +462,6 @@ workflow.onComplete { // if (params.sentieon) input_pair_reads.close() // else input_pair_reads_sentieon.close() -// bamMapped = bamMapped.dump(tag:'Mapped BAM') -// // Sort BAM whether they are standalone or should be merged - -// singleBam = Channel.create() -// multipleBam = Channel.create() -// bamMapped.groupTuple(by:[0, 1]) -// .choice(singleBam, multipleBam) {it[2].size() > 1 ? 1 : 0} -// singleBam = singleBam.map { -// idPatient, idSample, idRun, bam -> -// [idPatient, idSample, bam] -// } -// singleBam = singleBam.dump(tag:'Single BAM') // // STEP 1': MAPPING READS TO REFERENCE GENOME WITH SENTIEON BWA MEM @@ -524,26 +516,7 @@ workflow.onComplete { // // STEP 1.5: MERGING BAM FROM MULTIPLE LANES -// multipleBam = multipleBam.mix(multipleBamSentieon) - -// process MergeBamMapped { -// label 'cpus_8' - -// tag "${idPatient}-${idSample}" - -// input: -// set idPatient, idSample, idRun, file(bam) from multipleBam - -// output: -// set idPatient, idSample, file("${idSample}.bam") into bam_mapped_merged - -// script: -// """ -// samtools merge --threads ${task.cpus} ${idSample}.bam ${bam} -// """ -// } -// bam_mapped_merged = bam_mapped_merged.dump(tag:'Merged BAM') // bam_mapped_merged = bam_mapped_merged.mix(singleBam,singleBamSentieon) diff --git a/modules/local/mark_duplicates.nf b/modules/local/mark_duplicates.nf new file mode 100644 index 0000000000..b4ad07edfb --- /dev/null +++ b/modules/local/mark_duplicates.nf @@ -0,0 +1,46 @@ +process MARK_DUPLICATES { + label 'cpus_16' + tag "${idPatient}-${idSample}" + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { + if (it == "${idSample}.bam.metrics") "Reports/${idSample}/MarkDuplicates/${it}" + else "Preprocessing/${idSample}/DuplicatesMarked/${it}" + } + input: + tuple idPatient, idSample, path("${idSample}.bam") + output: + tuple idPatient, idSample, path("${idSample}.md.bam"), path("${idSample}.md.bam.bai"), emit: bam_duplicates_marked + tuple idPatient, idSample, emit: tsv_bam_duplicates_marked + path "${idSample}.bam.metrics", emit: duplicates_marked_report //is optional , applies when skip_qc is used(not implemented yet) + + //when: !(params.skip_markduplicates) + + script: + markdup_java_options = task.memory.toGiga() > 8 ? params.markdup_java_options : "\"-Xms" + (task.memory.toGiga() / 2).trunc() + "g -Xmx" + (task.memory.toGiga() - 1) + "g\"" + //metrics = 'markduplicates' in skip_qc ? '' : "-M ${idSample}.bam.metrics" + metrics = "-M ${idSample}.bam.metrics" + if (params.no_gatk_spark) + """ + gatk --java-options ${markdup_java_options} \ + MarkDuplicates \ + --MAX_RECORDS_IN_RAM 50000 \ + --INPUT ${idSample}.bam \ + --METRICS_FILE ${idSample}.bam.metrics \ + --TMP_DIR . 
\ + --ASSUME_SORT_ORDER coordinate \ + --CREATE_INDEX true \ + --OUTPUT ${idSample}.md.bam + mv ${idSample}.md.bai ${idSample}.md.bam.bai + """ + else + """ + gatk --java-options ${markdup_java_options} \ + MarkDuplicatesSpark \ + -I ${idSample}.bam \ + -O ${idSample}.md.bam \ + ${metrics} \ + --tmp-dir . \ + --create-output-bam-index true \ + --spark-master local[${task.cpus}] + """ + } \ No newline at end of file From 6f96eb803cab6c2a6da91d95fc7833e95cd0d704 Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Fri, 17 Jul 2020 22:09:54 +0200 Subject: [PATCH 052/200] Add MD to report, and add conditional execution --- main.nf | 5 ++--- modules/local/mark_duplicates.nf | 4 +--- modules/nf-core/multiqc.nf | 1 + 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/main.nf b/main.nf index 0e70053187..e0dc8286ff 100644 --- a/main.nf +++ b/main.nf @@ -380,21 +380,20 @@ workflow { bam_mapped.view() - if(!(params.skip_markduplicates)) - MARK_DUPLICATES(bam_mapped) + mark_duplicates_report = !(params.skip_markduplicates) ? MARK_DUPLICATES(bam_mapped).duplicates_marked_report : Channel.empty() OUTPUT_DOCUMENTATION( output_docs, output_docs_images) GET_SOFTWARE_VERSIONS() - MULTIQC( result_fastqc.collect().ifEmpty([]), multiqc_config, multiqc_custom_config.ifEmpty([]), GET_SOFTWARE_VERSIONS.out.yml, result_trim_galore.collect().ifEmpty([]), + mark_duplicates_report.collect().ifEmpty([]), workflow_summary) } diff --git a/modules/local/mark_duplicates.nf b/modules/local/mark_duplicates.nf index b4ad07edfb..fe1f9e8c2f 100644 --- a/modules/local/mark_duplicates.nf +++ b/modules/local/mark_duplicates.nf @@ -12,9 +12,7 @@ process MARK_DUPLICATES { tuple idPatient, idSample, path("${idSample}.md.bam"), path("${idSample}.md.bam.bai"), emit: bam_duplicates_marked tuple idPatient, idSample, emit: tsv_bam_duplicates_marked path "${idSample}.bam.metrics", emit: duplicates_marked_report //is optional , applies when skip_qc is used(not implemented yet) - - //when: !(params.skip_markduplicates) - + script: markdup_java_options = task.memory.toGiga() > 8 ? params.markdup_java_options : "\"-Xms" + (task.memory.toGiga() / 2).trunc() + "g -Xmx" + (task.memory.toGiga() - 1) + "g\"" //metrics = 'markduplicates' in skip_qc ? 
'' : "-M ${idSample}.bam.metrics" diff --git a/modules/nf-core/multiqc.nf b/modules/nf-core/multiqc.nf index a97e8daa17..701e1e1aa7 100644 --- a/modules/nf-core/multiqc.nf +++ b/modules/nf-core/multiqc.nf @@ -14,6 +14,7 @@ process MULTIQC { path multiqc_custom_config path software_versions path trim_galore + path mark_duplicates val workflow_summary output: From c758a88b96735488c369470a1cac0e81f1a4e64b Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Wed, 22 Jul 2020 10:26:00 +0200 Subject: [PATCH 053/200] Use 20.07-RC1, add skip_qc to MD --- main.nf | 76 ++++++++------------------------ modules/local/mark_duplicates.nf | 2 +- 2 files changed, 19 insertions(+), 59 deletions(-) diff --git a/main.nf b/main.nf index e0dc8286ff..f480cfda72 100644 --- a/main.nf +++ b/main.nf @@ -258,8 +258,9 @@ include { BWAMEM2_MEM } from './modules/local/bwamem2_mem.nf' include { GET_SOFTWARE_VERSIONS } from './modules/local/get_software_versions' include { OUTPUT_DOCUMENTATION } from './modules/local/output_documentation' include { TRIM_GALORE } from './modules/local/trim_galore.nf' -include { MERGE_BAM_MAPPED } from './modules/local/merge_mapped_bam' addParams(params) -include { MARK_DUPLICATES } from './modules/local/mark_duplicates' params(params) +include { MERGE_BAM_MAPPED } from './modules/local/merge_mapped_bam' +include { MARK_DUPLICATES } from './modules/local/mark_duplicates' addParams(skip_qc: skip_qc) +//include { BASE_RECALIBRATION } from './modules/local/base_recalibration' params(params) /* ================================================================================ @@ -346,8 +347,6 @@ workflow { pon_tbi = params.pon ? params.pon_index ?: BUILD_INDICES.out.pon_tbi : Channel.empty() // PREPROCESSING - intervals_bed.dump(tag:'bedintervals') - if(!('fastqc' in skip_qc)) result_fastqc = FASTQC(input_sample) else @@ -380,8 +379,20 @@ workflow { bam_mapped.view() - mark_duplicates_report = !(params.skip_markduplicates) ? MARK_DUPLICATES(bam_mapped).duplicates_marked_report : Channel.empty() + if(!(params.skip_markduplicates)){ + MARK_DUPLICATES(bam_mapped) + mark_duplicates_report = MARK_DUPLICATES.out.duplicates_marked_report + bam_duplicates_marked = MARK_DUPLICATES.out.bam_duplicates_marked + } + else { + mark_duplicates_report = Channel.empty() + bam_duplicates_marked = Channel.empty() + } + bamBaseRecalibrator = bam_duplicates_marked.combine(BUILD_INDICES.out.intervals_bed) + + //BASE_RECALIBRATION(bamBaseRecalibrator,dbsnp, dbsnp_index,fasta,) + OUTPUT_DOCUMENTATION( output_docs, output_docs_images) @@ -546,6 +557,7 @@ workflow.onComplete { // (bam_mapped_merged, bam_mapped_merged_to_index) = bam_mapped_merged.into(2) +//@Maxime: You included this process in merged_bam.nf, right? 
// process IndexBamFile { // label 'cpus_8' @@ -597,56 +609,6 @@ workflow.onComplete { // } // // STEP 2: MARKING DUPLICATES -// process MarkDuplicates { -// label 'cpus_16' - -// tag "${idPatient}-${idSample}" - -// publishDir params.outdir, mode: params.publish_dir_mode, -// saveAs: { -// if (it == "${idSample}.bam.metrics") "Reports/${idSample}/MarkDuplicates/${it}" -// else "Preprocessing/${idSample}/DuplicatesMarked/${it}" -// } - -// input: -// set idPatient, idSample, file("${idSample}.bam") from bam_mapped_merged - -// output: -// set idPatient, idSample, file("${idSample}.md.bam"), file("${idSample}.md.bam.bai") into bam_duplicates_marked -// set idPatient, idSample into tsv_bam_duplicates_marked -// file ("${idSample}.bam.metrics") optional true into duplicates_marked_report - -// when: !(params.skip_markduplicates) - -// script: -// markdup_java_options = task.memory.toGiga() > 8 ? params.markdup_java_options : "\"-Xms" + (task.memory.toGiga() / 2).trunc() + "g -Xmx" + (task.memory.toGiga() - 1) + "g\"" -// metrics = 'markduplicates' in skip_qc ? '' : "-M ${idSample}.bam.metrics" -// if (params.no_gatk_spark) -// """ -// gatk --java-options ${markdup_java_options} \ -// MarkDuplicates \ -// --MAX_RECORDS_IN_RAM 50000 \ -// --INPUT ${idSample}.bam \ -// --METRICS_FILE ${idSample}.bam.metrics \ -// --TMP_DIR . \ -// --ASSUME_SORT_ORDER coordinate \ -// --CREATE_INDEX true \ -// --OUTPUT ${idSample}.md.bam - -// mv ${idSample}.md.bai ${idSample}.md.bam.bai -// """ -// else -// """ -// gatk --java-options ${markdup_java_options} \ -// MarkDuplicatesSpark \ -// -I ${idSample}.bam \ -// -O ${idSample}.md.bam \ -// ${metrics} \ -// --tmp-dir . \ -// --create-output-bam-index true \ -// --spark-master local[${task.cpus}] -// """ -// } // (tsv_bam_duplicates_marked, tsv_bam_duplicates_marked_sample) = tsv_bam_duplicates_marked.into(2) @@ -681,9 +643,7 @@ workflow.onComplete { // (bamMD, bamMDToJoin, bam_duplicates_marked) = bam_duplicates_marked.into(3) -// bamBaseRecalibrator = bamMD.combine(intBaseRecalibrator) - -// bamBaseRecalibrator = bamBaseRecalibrator.dump(tag:'BAM FOR BASERECALIBRATOR') +// // // STEP 2': SENTIEON DEDUP diff --git a/modules/local/mark_duplicates.nf b/modules/local/mark_duplicates.nf index fe1f9e8c2f..40a6b4fad6 100644 --- a/modules/local/mark_duplicates.nf +++ b/modules/local/mark_duplicates.nf @@ -11,7 +11,7 @@ process MARK_DUPLICATES { output: tuple idPatient, idSample, path("${idSample}.md.bam"), path("${idSample}.md.bam.bai"), emit: bam_duplicates_marked tuple idPatient, idSample, emit: tsv_bam_duplicates_marked - path "${idSample}.bam.metrics", emit: duplicates_marked_report //is optional , applies when skip_qc is used(not implemented yet) + path "${idSample}.bam.metrics", optional : true, emit: duplicates_marked_report //is optional , applies when skip_qc is used(not implemented yet) script: markdup_java_options = task.memory.toGiga() > 8 ? 
params.markdup_java_options : "\"-Xms" + (task.memory.toGiga() / 2).trunc() + "g -Xmx" + (task.memory.toGiga() - 1) + "g\"" From b6552f2520fc64da26cd3a1cc1df7749e19d4691 Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Wed, 22 Jul 2020 10:33:57 +0200 Subject: [PATCH 054/200] Add blank line for linting --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index bac12f4210..4d03680099 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) a - [#238](https://github.com/nf-core/sarek/pull/238) -Add subworkflow for building all the indices - [#241](https://github.com/nf-core/sarek/pull/241) -Add modules and workflows parts for preprocessing steps + ## [dev](https://github.com/nf-core/sarek/tree/dev) - [#234](https://github.com/nf-core/sarek/pull/234) -Switching to DSL2 From f34fc2fb56cfdb184df9e66168d99e4bf9242319 Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Wed, 22 Jul 2020 10:39:46 +0200 Subject: [PATCH 055/200] Bump minimal version to 20.07 --- .github/workflows/ci.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 477455964d..3fa1d1f8c3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,7 +12,7 @@ jobs: strategy: matrix: # Nextflow versions: check pipeline minimum and current latest - nxf_ver: ['20.04.1', ''] + nxf_ver: ['20.07.0', ''] steps: - uses: actions/checkout@v2 - name: Install Nextflow @@ -42,7 +42,7 @@ jobs: sudo mv nextflow /usr/local/bin/ env: # Only check Nextflow pipeline minimum version - NXF_VER: '20.04.1' + NXF_VER: '20.07.0' - name: Pull docker image run: | docker pull nfcore/sarek:dev @@ -65,7 +65,7 @@ jobs: sudo mv nextflow /usr/local/bin/ env: # Only check Nextflow pipeline minimum version - NXF_VER: '19.10.0' + NXF_VER: '20.07.0' - name: Pull docker image run: docker pull nfcore/sarek:dev - name: Get test data @@ -114,7 +114,7 @@ jobs: sudo mv nextflow /usr/local/bin/ env: # Only check Nextflow pipeline minimum version - NXF_VER: '19.10.0' + NXF_VER: '20.07.0' - name: Pull docker image run: docker pull nfcore/sarek:dev - name: Run ${{ matrix.profile }} test @@ -145,7 +145,7 @@ jobs: sudo mv nextflow /usr/local/bin/ env: # Only check Nextflow pipeline minimum version - NXF_VER: '19.10.0' + NXF_VER: '20.07.0' - name: Pull docker image run: docker pull nfcore/sarek:dev - name: Run ${{ matrix.tool }} test From b151dd566b5cb7fbdcecf70cd1e24d438a8ed440 Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Wed, 22 Jul 2020 10:44:40 +0200 Subject: [PATCH 056/200] Fix version naming --- .github/workflows/ci.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3fa1d1f8c3..ecaf2237ae 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,7 +12,7 @@ jobs: strategy: matrix: # Nextflow versions: check pipeline minimum and current latest - nxf_ver: ['20.07.0', ''] + nxf_ver: ['20.07.0-RC1', ''] steps: - uses: actions/checkout@v2 - name: Install Nextflow @@ -42,7 +42,7 @@ jobs: sudo mv nextflow /usr/local/bin/ env: # Only check Nextflow pipeline minimum version - NXF_VER: '20.07.0' + NXF_VER: '20.07.0-RC1' - name: Pull docker image run: | docker pull nfcore/sarek:dev @@ -65,7 +65,7 @@ jobs: sudo mv nextflow /usr/local/bin/ env: # Only check Nextflow pipeline minimum version - NXF_VER: '20.07.0' + NXF_VER: '20.07.0-RC1' - name: 
Pull docker image run: docker pull nfcore/sarek:dev - name: Get test data @@ -93,7 +93,7 @@ jobs: sudo mv nextflow /usr/local/bin/ env: # Only check Nextflow pipeline minimum version - NXF_VER: '19.10.0' + NXF_VER: '20.07.0-RC1' - name: Pull docker image run: docker pull nfcore/sarek:dev - name: Run test for minimal genomes @@ -114,7 +114,7 @@ jobs: sudo mv nextflow /usr/local/bin/ env: # Only check Nextflow pipeline minimum version - NXF_VER: '20.07.0' + NXF_VER: '20.07.0-RC1' - name: Pull docker image run: docker pull nfcore/sarek:dev - name: Run ${{ matrix.profile }} test @@ -145,7 +145,7 @@ jobs: sudo mv nextflow /usr/local/bin/ env: # Only check Nextflow pipeline minimum version - NXF_VER: '20.07.0' + NXF_VER: '20.07.0-RC1' - name: Pull docker image run: docker pull nfcore/sarek:dev - name: Run ${{ matrix.tool }} test From 7c78d31ffa50192c9d05b790402569d1b9c8d005 Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Wed, 22 Jul 2020 10:53:46 +0200 Subject: [PATCH 057/200] use nextflow.enable.dsl and fix some minor things for that --- main.nf | 2 +- modules/local/base_recalibration.nf | 40 +++++++++++++++++++++++++++++ modules/local/mark_duplicates.nf | 12 ++++----- modules/local/merge_mapped_bam.nf | 4 +-- modules/local/trim_galore.nf | 2 +- 5 files changed, 50 insertions(+), 10 deletions(-) create mode 100644 modules/local/base_recalibration.nf diff --git a/main.nf b/main.nf index f480cfda72..e742ec8568 100644 --- a/main.nf +++ b/main.nf @@ -19,7 +19,7 @@ nf-core/sarek: -------------------------------------------------------------------------------- */ -nextflow.preview.dsl = 2 +nextflow.enable.dsl=2 // Print help message if required diff --git a/modules/local/base_recalibration.nf b/modules/local/base_recalibration.nf new file mode 100644 index 0000000000..9eca053656 --- /dev/null +++ b/modules/local/base_recalibration.nf @@ -0,0 +1,40 @@ +// process BASE_RECALIBRATION { +// label 'cpus_1' + +// tag "${idPatient}-${idSample}-${intervalBed.baseName}" + +// input: +// tuple idPatient, idSample, file(bam), file(bai), file(intervalBed) //from bamBaseRecalibrator +// path dbsnp //from dbsnp +// path dbsnpIndex// from dbsnp_tbi +// path fasta //from fasta +// path dict // from dict +// path fastaFai // from fai +// path knownIndels // from known_indels +// path knownIndelsIndex // from known_indels_tbi + +// output: +// tuple idPatient, idSample, file "${prefix}${idSample}.recal.table", emit: tableGatherBQSRReports +// tuple idPatient, idSample, emit: recalTableTSVnoInt + +// //when: params.known_indels + +// script: +// dbsnpOptions = params.dbsnp ? "--known-sites ${dbsnp}" : "" +// knownOptions = params.known_indels ? knownIndels.collect{"--known-sites ${it}"}.join(' ') : "" +// prefix = params.no_intervals ? "" : "${intervalBed.baseName}_" +// intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}" +// // TODO: --use-original-qualities ??? +// """ +// gatk --java-options -Xmx${task.memory.toGiga()}g \ +// BaseRecalibrator \ +// -I ${bam} \ +// -O ${prefix}${idSample}.recal.table \ +// --tmp-dir . 
\ +// -R ${fasta} \ +// ${intervalsOptions} \ +// ${dbsnpOptions} \ +// ${knownOptions} \ +// --verbosity INFO +// """ +// } \ No newline at end of file diff --git a/modules/local/mark_duplicates.nf b/modules/local/mark_duplicates.nf index 40a6b4fad6..c2cae3d722 100644 --- a/modules/local/mark_duplicates.nf +++ b/modules/local/mark_duplicates.nf @@ -7,16 +7,16 @@ process MARK_DUPLICATES { else "Preprocessing/${idSample}/DuplicatesMarked/${it}" } input: - tuple idPatient, idSample, path("${idSample}.bam") + tuple val(idPatient), val(idSample), path("${idSample}.bam") output: - tuple idPatient, idSample, path("${idSample}.md.bam"), path("${idSample}.md.bam.bai"), emit: bam_duplicates_marked - tuple idPatient, idSample, emit: tsv_bam_duplicates_marked - path "${idSample}.bam.metrics", optional : true, emit: duplicates_marked_report //is optional , applies when skip_qc is used(not implemented yet) + tuple val(idPatient), val(idSample), path("${idSample}.md.bam"), path("${idSample}.md.bam.bai"), emit: bam_duplicates_marked + tuple val(idPatient), val(idSample), emit: tsv_bam_duplicates_marked + path "${idSample}.bam.metrics", optional : true, emit: duplicates_marked_report script: markdup_java_options = task.memory.toGiga() > 8 ? params.markdup_java_options : "\"-Xms" + (task.memory.toGiga() / 2).trunc() + "g -Xmx" + (task.memory.toGiga() - 1) + "g\"" - //metrics = 'markduplicates' in skip_qc ? '' : "-M ${idSample}.bam.metrics" - metrics = "-M ${idSample}.bam.metrics" + metrics = 'markduplicates' in params.skip_qc ? '' : "-M ${idSample}.bam.metrics" + if (params.no_gatk_spark) """ gatk --java-options ${markdup_java_options} \ diff --git a/modules/local/merge_mapped_bam.nf b/modules/local/merge_mapped_bam.nf index 8e70342bbd..66758c0d11 100644 --- a/modules/local/merge_mapped_bam.nf +++ b/modules/local/merge_mapped_bam.nf @@ -4,10 +4,10 @@ process MERGE_BAM_MAPPED { tag "${patient}-${sample}" input: - tuple patient, sample, run, path(bam), path(bai) + tuple val(patient), val(sample), val(run), path(bam), path(bai) output: - tuple patient, sample, path("${sample}.bam"), path("${sample}.bam.bai") + tuple val(patient), val(sample), path("${sample}.bam"), path("${sample}.bam.bai") script: """ diff --git a/modules/local/trim_galore.nf b/modules/local/trim_galore.nf index d0bf00ff88..a49ccda2d7 100644 --- a/modules/local/trim_galore.nf +++ b/modules/local/trim_galore.nf @@ -16,7 +16,7 @@ process TRIM_GALORE { output: path "*.{html,zip,txt}", emit: report - tuple idPatient, idSample, idRun, path("${idSample}_${idRun}_R1_val_1.fq.gz"), path("${idSample}_${idRun}_R2_val_2.fq.gz"), emit: trimmed_reads + tuple val(idPatient), val(idSample), val(idRun), path("${idSample}_${idRun}_R1_val_1.fq.gz"), path("${idSample}_${idRun}_R2_val_2.fq.gz"), emit: trimmed_reads script: // Calculate number of --cores for TrimGalore based on value of task.cpus From 9bb93fc6b3811ace3773bb13716fd6c1e084c875 Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Wed, 22 Jul 2020 11:20:29 +0200 Subject: [PATCH 058/200] Bump nf version in badge --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 353d9e3194..29fab9a95d 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ > **An open-source analysis pipeline to detect germline or somatic variants from whole genome or targeted sequencing** -[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A520.06.0--edge-brightgreen.svg)](https://www.nextflow.io/) 
+[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A520.07.0--RC1-brightgreen.svg)](https://www.nextflow.io/) [![nf-core](https://img.shields.io/badge/nf--core-pipeline-brightgreen.svg)](https://nf-co.re/) [![DOI](https://zenodo.org/badge/184289291.svg)](https://zenodo.org/badge/latestdoi/184289291) From 5e288cad36e6155a8b2af0f9941eee3170bfcb62 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 23 Jul 2020 22:15:25 +0200 Subject: [PATCH 059/200] use meta map --- modules/local/bwamem2_mem.nf | 37 ++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/modules/local/bwamem2_mem.nf b/modules/local/bwamem2_mem.nf index 7e425534d7..2a1a7253ce 100644 --- a/modules/local/bwamem2_mem.nf +++ b/modules/local/bwamem2_mem.nf @@ -1,29 +1,38 @@ -params.bwa_options = "-K 100000000 -M" -params.sequencer = "ILLUMINA" - process BWAMEM2_MEM { - label 'CPUS_MAX' - - tag "${sample}_${run}" + tag "${meta.id}" + label 'process_high' - publishDir "${params.outdir}/bwamem2_mem", mode: 'copy' + publishDir "${params.outdir}/bwamem2/${meta.sample}", + mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.endsWith('.version.txt')) null + else filename } input: - tuple val(patient), val(sample), val(run), path(read1), path(read2) + tuple val(meta), path(reads) path bwa path fasta path fai + val options output: - tuple val(patient), val(sample), val(run), path("*.bam"), path("*.bai") + tuple val(meta), path("*.bam"), path("*.bai") script: CN = params.sequencing_center ? "CN:${params.sequencing_center}\\t" : "" - readGroup = "@RG\\tID:${run}\\t${CN}PU:${run}\\tSM:${sample}\\tLB:${sample}\\tPL:${params.sequencer}" + readGroup = "@RG\\tID:${meta.run}\\t${CN}PU:${meta.run}\\tSM:${meta.sample}\\tLB:${meta.sample}\\tPL:ILLUMINA" + extra = meta.status == 1 ? 
"-B 3" : "" """ - bwa-mem2 mem ${params.bwa_options} -R \"${readGroup}\" -t ${task.cpus} \ - ${fasta} ${read1} ${read2} | \ - samtools sort --threads ${task.cpus} -m 2G - > ${sample}_${run}.bam - samtools index ${sample}_${run}.bam + bwa-mem2 mem \ + ${options.args_bwamem2} \ + -R \"${readGroup}\" \ + ${extra} \ + -t ${task.cpus} \ + ${fasta} ${reads} | \ + samtools sort --threads ${task.cpus} -m 2G - > ${meta.id}.bam + + samtools index ${meta.id}.bam + + echo \$(bwa-mem2 version 2>&1) > bwa-mem2.version.txt """ } \ No newline at end of file From 1672006c208d0cedec03b304c558adf0fe20822b Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 23 Jul 2020 22:16:08 +0200 Subject: [PATCH 060/200] define meta map for extract_fastq() --- modules/local/functions.nf | 46 +++++++++++++------------------------- 1 file changed, 16 insertions(+), 30 deletions(-) diff --git a/modules/local/functions.nf b/modules/local/functions.nf index d90e1bc024..584201ac0f 100644 --- a/modules/local/functions.nf +++ b/modules/local/functions.nf @@ -137,22 +137,6 @@ def extract_fastq_from_dir(pattern) { fastq } -// Extract gender and status from Channel -def extract_infos(channel) { - def genderMap = [:] - def statusMap = [:] - channel = channel.map{ it -> - def idPatient = it[0] - def gender = it[1] - def status = it[2] - def idSample = it[3] - genderMap[idPatient] = gender - statusMap[idPatient, idSample] = status - [idPatient] + it[3..-1] - } - [genderMap, statusMap, channel] -} - // Channeling the TSV file containing FASTQ or BAM // Format is: "subject gender status sample lane fastq1 fastq2" // or: "subject gender status sample lane bam" @@ -160,25 +144,27 @@ def extract_fastq(tsvFile) { Channel.from(tsvFile) .splitCsv(sep: '\t') .map { row -> - def idPatient = row[0] - def gender = row[1] - def status = return_status(row[2].toInteger()) - def idSample = row[3] - def idRun = row[4] - def file1 = return_file(row[5]) - def file2 = "null" - if (has_extension(file1, "fastq.gz") || has_extension(file1, "fq.gz") || has_extension(file1, "fastq") || has_extension(file1, "fq")) { + def meta = [:] + meta.patient = row[0] + meta.gender = row[1] + meta.status = return_status(row[2].toInteger()) + meta.sample = row[3] + meta.run = row[4] + meta.id = "${meta.sample}-${meta.run}" + read1 = return_file(row[5]) + read2 = "null" + if (has_extension(read1, "fastq.gz") || has_extension(read1, "fq.gz") || has_extension(read1, "fastq") || has_extension(read1, "fq")) { check_number_of_item(row, 7) - file2 = return_file(row[6]) - if (!has_extension(file2, "fastq.gz") && !has_extension(file2, "fq.gz") && !has_extension(file2, "fastq") && !has_extension(file2, "fq")) exit 1, "File: ${file2} has the wrong extension. See --help for more information" - if (has_extension(file1, "fastq") || has_extension(file1, "fq") || has_extension(file2, "fastq") || has_extension(file2, "fq")) { + read2 = return_file(row[6]) + if (!has_extension(read2, "fastq.gz") && !has_extension(read2, "fq.gz") && !has_extension(read2, "fastq") && !has_extension(read2, "fq")) exit 1, "File: ${file2} has the wrong extension. See --help for more information" + if (has_extension(read1, "fastq") || has_extension(read1, "fq") || has_extension(read2, "fastq") || has_extension(read2, "fq")) { exit 1, "We do recommend to use gziped fastq file to help you reduce your data footprint." 
} } - else if (has_extension(file1, "bam")) check_number_of_item(row, 6) - else "No recognisable extention for input file: ${file1}" + else if (has_extension(read1, "bam")) check_number_of_item(row, 6) + else "No recognisable extention for input file: ${read1}" - [idPatient, gender, status, idSample, idRun, file1, file2] + return [meta, [read1, read2]] } } From 6738fbbfcff25c34e1e1f553963140dffcd2465b Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 23 Jul 2020 22:17:35 +0200 Subject: [PATCH 061/200] use meta map --- modules/local/merge_mapped_bam.nf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/local/merge_mapped_bam.nf b/modules/local/merge_mapped_bam.nf index 66758c0d11..409993f068 100644 --- a/modules/local/merge_mapped_bam.nf +++ b/modules/local/merge_mapped_bam.nf @@ -1,17 +1,17 @@ process MERGE_BAM_MAPPED { - label 'cpus' + label 'cpus_8' - tag "${patient}-${sample}" + tag "${meta.id}" input: - tuple val(patient), val(sample), val(run), path(bam), path(bai) + tuple val(meta), path(bam), path(bai) output: - tuple val(patient), val(sample), path("${sample}.bam"), path("${sample}.bam.bai") + tuple val(meta), path("${meta.sample}.bam"), path("${meta.sample}.bam.bai") script: """ - samtools merge --threads ${task.cpus} ${sample}.bam ${bam} - samtools index ${sample}.bam + samtools merge --threads ${task.cpus} ${meta.sample}.bam ${bam} + samtools index ${meta.sample}.bam """ } \ No newline at end of file From a55315f838c63b9465b3dfeed4e3db0358967bac Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 23 Jul 2020 22:18:30 +0200 Subject: [PATCH 062/200] use meta map --- modules/nf-core/fastqc.nf | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/modules/nf-core/fastqc.nf b/modules/nf-core/fastqc.nf index c5408d179e..44810ce9ba 100644 --- a/modules/nf-core/fastqc.nf +++ b/modules/nf-core/fastqc.nf @@ -1,19 +1,31 @@ process FASTQC { - label 'FASTQC' + tag "${meta.id}" + label 'process_medium' label 'cpus_2' - tag "${idPatient}-${idRun}" + publishDir "${params.outdir}/Reports/${meta.sample}/FastQC/${meta.id}", + mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.endsWith('.version.txt')) null + else filename } - publishDir "${params.outdir}/Reports/${idSample}/FastQC/${idSample}_${idRun}", mode: params.publish_dir_mode + container "quay.io/biocontainers/fastqc:0.11.9--0" input: - tuple val(idPatient), val(idSample), val(idRun), path("${idSample}_${idRun}_R1.fastq.gz"), path("${idSample}_${idRun}_R2.fastq.gz") + tuple val(meta), path(reads) output: - path "*.{html,zip}" + tuple val(meta), path("*.html"), emit: html + tuple val(meta), path("*.zip"), emit: zip + path "*.version.txt", emit: version script: + prefix = "${meta.id}" """ - fastqc -t 2 -q ${idSample}_${idRun}_R1.fastq.gz ${idSample}_${idRun}_R2.fastq.gz + [ ! -f ${prefix}_1.fastq.gz ] && ln -s ${reads[0]} ${prefix}_1.fastq.gz + [ ! 
-f ${prefix}_2.fastq.gz ] && ln -s ${reads[1]} ${prefix}_2.fastq.gz + fastqc --threads ${task.cpus} ${prefix}_1.fastq.gz ${prefix}_2.fastq.gz + + fastqc --version | sed -n "s/.*\\(v.*\$\\)/\\1/p" > fastqc.version.txt """ } From a65be18c74c80dde115df2dd907b595b1749420e Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 23 Jul 2020 22:18:47 +0200 Subject: [PATCH 063/200] update module --- modules/nf-core/multiqc.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/nf-core/multiqc.nf b/modules/nf-core/multiqc.nf index 701e1e1aa7..4be4ed44ff 100644 --- a/modules/nf-core/multiqc.nf +++ b/modules/nf-core/multiqc.nf @@ -9,7 +9,8 @@ process MULTIQC { publishDir "${params.outdir}/multiqc", mode: params.publish_dir_mode input: - path fastqc + path fastqc_html + path fastqc_zip path multiqc_config path multiqc_custom_config path software_versions From 71f32fd779d806c66f9b8172c738ce7f30661caa Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 23 Jul 2020 22:19:04 +0200 Subject: [PATCH 064/200] code polish --- modules/subworkflows/build_indices.nf | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/modules/subworkflows/build_indices.nf b/modules/subworkflows/build_indices.nf index f7868fb104..92934ee800 100644 --- a/modules/subworkflows/build_indices.nf +++ b/modules/subworkflows/build_indices.nf @@ -30,40 +30,33 @@ workflow BUILD_INDICES{ main: + result_bwa = Channel.empty() if (!(params.bwa) && params.fasta && 'mapping' in step) result_bwa = BWAMEM2_INDEX(fasta) - else - result_bwa = Channel.empty() + result_dict = Channel.empty() if (!(params.dict) && params.fasta && !('annotate' in step) && !('controlfreec' in step)) result_dict = GATK_CREATE_SEQUENCE_DICTIONARY(fasta) - else - result_dict = Channel.empty() + result_fai = Channel.empty() if (!(params.fasta_fai) && params.fasta && !('annotate' in step)) result_fai = SAMTOOLS_FAIDX(fasta) - else - result_fai = Channel.empty() + result_dbsnp_tbi = Channel.empty() if (!(params.dbsnp_index) && params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || 'tnscope' in tools)) result_dbsnp_tbi = HTSLIB_TABIX_DBSNP(dbsnp) - else - result_dbsnp_tbi = Channel.empty() + result_germline_resource_tbi = Channel.empty() if (!(params.germline_resource_index) && params.germline_resource && 'mutect2' in tools) result_germline_resource_tbi = HTSLIB_TABIX_GERMLINE_RESOURCE(germline_resource) - else - result_germline_resource_tbi = Channel.empty() + result_known_indels_tbi = Channel.empty() if (!(params.known_indels_index) && params.known_indels && ('mapping' in step || 'preparerecalibration' in step)) result_known_indels_tbi = HTSLIB_TABIX_KNOWN_INDELS(known_indels) - else - result_known_indels_tbi = Channel.empty() + result_pon_tbi = Channel.empty() if (!(params.pon_index) && params.pon && ('tnscope' in tools || 'mutect2' in tools)) result_pon_tbi = HTSLIB_TABIX_PON(pon) - else - result_pon_tbi = Channel.empty() if (params.no_intervals) { file("${params.outdir}/no_intervals.bed").text = "no_intervals\n" From 5229f73314e1d834d19d9166790c998d33d046cb Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 23 Jul 2020 22:19:20 +0200 Subject: [PATCH 065/200] update version --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index c2d00e6510..2214f1746f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -11,7 +11,7 @@ manifest { homePage = 
'https://github.com/nf-core/sarek' description = 'An open-source analysis pipeline to detect germline or somatic variants from whole genome or targeted sequencing' mainScript = 'main.nf' - nextflowVersion = '>=19.10.0' + nextflowVersion = '>=20.07.0' version = '3.0dev' } From 9dce2a0153b51d0973345f7fe77334bab7d17feb Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 23 Jul 2020 22:41:41 +0200 Subject: [PATCH 066/200] fix output --- modules/nf-core/fastqc.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/nf-core/fastqc.nf b/modules/nf-core/fastqc.nf index 44810ce9ba..2533f70fa3 100644 --- a/modules/nf-core/fastqc.nf +++ b/modules/nf-core/fastqc.nf @@ -15,9 +15,9 @@ process FASTQC { tuple val(meta), path(reads) output: - tuple val(meta), path("*.html"), emit: html - tuple val(meta), path("*.zip"), emit: zip - path "*.version.txt", emit: version + path "*.html", emit: html + path "*.version.txt", emit: version + path "*.zip", emit: zip script: prefix = "${meta.id}" From aa224dc3d7b88aba5aa9bdf2092e6c8d09ee83ad Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 23 Jul 2020 22:42:05 +0200 Subject: [PATCH 067/200] use meta map --- main.nf | 119 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 77 insertions(+), 42 deletions(-) diff --git a/main.nf b/main.nf index e742ec8568..a920a9abc6 100644 --- a/main.nf +++ b/main.nf @@ -44,7 +44,6 @@ include { extract_bam; extract_fastq; extract_fastq_from_dir; - extract_infos; has_extension } from './modules/local/functions' @@ -66,12 +65,12 @@ multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_ output_docs = file("$baseDir/docs/output.md", checkIfExists: true) output_docs_images = file("$baseDir/docs/images/", checkIfExists: true) -// // Check if genome exists in the config file -// if (params.genomes && !params.genomes.containsKey(params.genome) && !params.igenomes_ignore) { -// exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}" -// } else if (params.genomes && !params.genomes.containsKey(params.genome) && params.igenomes_ignore) { -// exit 1, "The provided genome '${params.genome}' is not available in the genomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}" -// } +// Check if genome exists in the config file +if (params.genomes && !params.genomes.containsKey(params.genome) && !params.igenomes_ignore) { + exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}" +} else if (params.genomes && !params.genomes.containsKey(params.genome) && params.igenomes_ignore) { + exit 1, "The provided genome '${params.genome}' is not available in the genomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}" +} step_list = define_step_list() step = params.step ? params.step.toLowerCase().replaceAll('-', '').replaceAll('_', '') : '' @@ -167,8 +166,6 @@ if (tsv_path) { log.info "Trying automatic annotation on files in the VariantCalling/ directory" } else exit 1, 'No sample were defined, see --help' -(gender_map, status_map, input_sample) = extract_infos(input_sample) - // input_sample.dump(tag: 'input sample') /* @@ -199,7 +196,7 @@ params.snpeff_db = params.genome ? params.genomes[params.genome].s params.species = params.genome ? 
params.genomes[params.genome].species ?: false : false params.vep_cache_version = params.genome ? params.genomes[params.genome].vep_cache_version ?: false : false -// Initialize channels based on params +// Initialize file channels based on params chr_dir = params.chr_dir ?: Channel.empty() chr_length = params.chr_length ?: Channel.empty() dbsnp = params.dbsnp ?: Channel.empty() @@ -210,6 +207,8 @@ loci = params.ac_loci ?: Channel.empty() loci_gc = params.ac_loci_gc ?: Channel.empty() mappability = params.mappability ?: Channel.empty() pon = params.pon ?: Channel.empty() + +// Initialize value channels based on params snpeff_cache = params.snpeff_cache ?: Channel.empty() snpeff_db = params.snpeff_db ?: Channel.empty() snpeff_species = params.species ?: Channel.empty() @@ -253,7 +252,6 @@ if (params.sentieon) log.warn "[nf-core/sarek] Sentieon will be used, only works ================================================================================ */ - include { BWAMEM2_MEM } from './modules/local/bwamem2_mem.nf' include { GET_SOFTWARE_VERSIONS } from './modules/local/get_software_versions' include { OUTPUT_DOCUMENTATION } from './modules/local/output_documentation' @@ -347,63 +345,100 @@ workflow { pon_tbi = params.pon ? params.pon_index ?: BUILD_INDICES.out.pon_tbi : Channel.empty() // PREPROCESSING - if(!('fastqc' in skip_qc)) - result_fastqc = FASTQC(input_sample) - else - result_fastqc = Channel.empty() + + fastqc_html = Channel.empty() + fastqc_version = Channel.empty() + fastqc_zip = Channel.empty() + + if (!('fastqc' in skip_qc)) { + FASTQC(input_sample) + fastqc_html = FASTQC.out.html + fastqc_version = FASTQC.out.version + fastqc_zip = FASTQC.out.zip + } + + def bwamem2_mem_options = [:] + + bwamem2_mem_options.args_bwamem2 = "-K 100000000 -M" + trim_galore_report = Channel.empty() if (params.trim_fastq) { TRIM_GALORE(input_sample) - result_trim_galore = TRIM_GALORE.out.report - BWAMEM2_MEM(TRIM_GALORE.out.trimmed_reads, bwa, fasta, fai) - } - else { - result_trim_galore = Channel.empty() - BWAMEM2_MEM(input_sample, bwa, fasta, fai) + BWAMEM2_MEM(TRIM_GALORE.out.trimmed_reads, bwa, fasta, fai, bwamem2_mem_options) + trim_galore_report = TRIM_GALORE.out.report } - - BWAMEM2_MEM.out.groupTuple(by:[0, 1]) - .branch { - single: it[2].size() == 1 - multiple: it[2].size() > 1 + else BWAMEM2_MEM(input_sample, bwa, fasta, fai, bwamem2_mem_options) + + results = BWAMEM2_MEM.out.map{ meta, bam, bai -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + [patient, sample, gender, status, bam, bai] + }.groupTuple(by: [0,1]) + .branch{ + single: it[4].size() == 1 + multiple: it[4].size() > 1 }.set { bam } bam_single = bam.single.map { - idPatient, idSample, idRun, bam, bai -> - [idPatient, idSample, bam[0], bai[0]] + patient, sample, gender, status, bam, bai -> + + def meta = [:] + meta.patient = patient + meta.sample = sample + meta.gender = gender[0] + meta.status = status[0] + meta.id = sample + + [meta, bam[0], bai[0]] + } + + bam_multiple = bam.multiple.map { + patient, sample, gender, status, bam, bai -> + + def meta = [:] + meta.patient = patient + meta.sample = sample + meta.gender = gender[0] + meta.status = status[0] + meta.id = sample + + [meta, bam, bai] } - //multipleBam = multipleBam.mix(multipleBamSentieon) + // multipleBam = multipleBam.mix(multipleBamSentieon) - bam_mapped = bam_single.mix(MERGE_BAM_MAPPED(bam.multiple)) + bam_mapped = bam_single.mix(MERGE_BAM_MAPPED(bam_multiple)) bam_mapped.view() - 
if(!(params.skip_markduplicates)){ - MARK_DUPLICATES(bam_mapped) - mark_duplicates_report = MARK_DUPLICATES.out.duplicates_marked_report - bam_duplicates_marked = MARK_DUPLICATES.out.bam_duplicates_marked - } - else { - mark_duplicates_report = Channel.empty() - bam_duplicates_marked = Channel.empty() + mark_duplicates_report = Channel.empty() + bam_duplicates_marked = Channel.empty() + + if (!(params.skip_markduplicates)) { + // MARK_DUPLICATES(bam_mapped) + // mark_duplicates_report = MARK_DUPLICATES.out.report + // bam_duplicates_marked = MARK_DUPLICATES.out.bam } - bamBaseRecalibrator = bam_duplicates_marked.combine(BUILD_INDICES.out.intervals_bed) + // bamBaseRecalibrator = bam_duplicates_marked.combine(BUILD_INDICES.out.intervals_bed) - //BASE_RECALIBRATION(bamBaseRecalibrator,dbsnp, dbsnp_index,fasta,) + // //BASE_RECALIBRATION(bamBaseRecalibrator,dbsnp, dbsnp_index,fasta,) OUTPUT_DOCUMENTATION( output_docs, output_docs_images) GET_SOFTWARE_VERSIONS() + MULTIQC( - result_fastqc.collect().ifEmpty([]), + fastqc_html.collect().ifEmpty([]), + fastqc_zip.collect().ifEmpty([]), multiqc_config, multiqc_custom_config.ifEmpty([]), GET_SOFTWARE_VERSIONS.out.yml, - result_trim_galore.collect().ifEmpty([]), + trim_galore_report.collect().ifEmpty([]), mark_duplicates_report.collect().ifEmpty([]), workflow_summary) } From 9ea67af9c14a41bbb3929932eb223eef0dfb3fbe Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 23 Jul 2020 22:42:17 +0200 Subject: [PATCH 068/200] use meta map --- modules/local/mark_duplicates.nf | 85 +++++++++++++++++--------------- 1 file changed, 44 insertions(+), 41 deletions(-) diff --git a/modules/local/mark_duplicates.nf b/modules/local/mark_duplicates.nf index c2cae3d722..88b779a534 100644 --- a/modules/local/mark_duplicates.nf +++ b/modules/local/mark_duplicates.nf @@ -1,44 +1,47 @@ process MARK_DUPLICATES { - label 'cpus_16' - tag "${idPatient}-${idSample}" - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: { - if (it == "${idSample}.bam.metrics") "Reports/${idSample}/MarkDuplicates/${it}" - else "Preprocessing/${idSample}/DuplicatesMarked/${it}" - } - input: - tuple val(idPatient), val(idSample), path("${idSample}.bam") - output: - tuple val(idPatient), val(idSample), path("${idSample}.md.bam"), path("${idSample}.md.bam.bai"), emit: bam_duplicates_marked - tuple val(idPatient), val(idSample), emit: tsv_bam_duplicates_marked - path "${idSample}.bam.metrics", optional : true, emit: duplicates_marked_report + label 'cpus_16' + tag "${id}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { + if (it == "${meta.sample}.bam.metrics") "Reports/${meta.sample}/MarkDuplicates/${it}" + else "Preprocessing/${meta.sample}/DuplicatesMarked/${it}" + } + + input: + tuple val(meta), path("${meta.sample}.bam"), path("${meta.sample}.bam.bai") + + output: + tuple val(meta), path("${meta.sample}.md.bam"), path("${meta.sample}.md.bam.bai"), emit: bam + tuple val(meta), emit: tsv + path "${meta.sample}.bam.metrics", optional : true, emit: report - script: - markdup_java_options = task.memory.toGiga() > 8 ? params.markdup_java_options : "\"-Xms" + (task.memory.toGiga() / 2).trunc() + "g -Xmx" + (task.memory.toGiga() - 1) + "g\"" - metrics = 'markduplicates' in params.skip_qc ? '' : "-M ${idSample}.bam.metrics" + script: + markdup_java_options = task.memory.toGiga() > 8 ? params.markdup_java_options : "\"-Xms" + (task.memory.toGiga() / 2).trunc() + "g -Xmx" + (task.memory.toGiga() - 1) + "g\"" + metrics = 'markduplicates' in params.skip_qc ? 
'' : "-M ${meta.sample}.bam.metrics" - if (params.no_gatk_spark) - """ - gatk --java-options ${markdup_java_options} \ - MarkDuplicates \ - --MAX_RECORDS_IN_RAM 50000 \ - --INPUT ${idSample}.bam \ - --METRICS_FILE ${idSample}.bam.metrics \ - --TMP_DIR . \ - --ASSUME_SORT_ORDER coordinate \ - --CREATE_INDEX true \ - --OUTPUT ${idSample}.md.bam - mv ${idSample}.md.bai ${idSample}.md.bam.bai - """ - else - """ - gatk --java-options ${markdup_java_options} \ - MarkDuplicatesSpark \ - -I ${idSample}.bam \ - -O ${idSample}.md.bam \ - ${metrics} \ - --tmp-dir . \ - --create-output-bam-index true \ - --spark-master local[${task.cpus}] - """ - } \ No newline at end of file + if (params.no_gatk_spark) + """ + gatk --java-options ${markdup_java_options} \ + MarkDuplicates \ + --MAX_RECORDS_IN_RAM 50000 \ + --INPUT ${meta.sample}.bam \ + --METRICS_FILE ${meta.sample}.bam.metrics \ + --TMP_DIR . \ + --ASSUME_SORT_ORDER coordinate \ + --CREATE_INDEX true \ + --OUTPUT ${meta.sample}.md.bam + mv ${meta.sample}.md.bai ${meta.sample}.md.bam.bai + """ + else + """ + gatk --java-options ${markdup_java_options} \ + MarkDuplicatesSpark \ + -I ${meta.sample}.bam \ + -O ${meta.sample}.md.bam \ + ${metrics} \ + --tmp-dir . \ + --create-output-bam-index true \ + --spark-master local[${task.cpus}] + """ +} \ No newline at end of file From 34d15f66187a71c7b4d8d0578c10e05fef9b9f72 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Fri, 24 Jul 2020 11:57:31 +0200 Subject: [PATCH 069/200] update with new folder structure --- main.nf | 20 +++++++++---------- modules/local/{ => custom}/functions.nf | 0 .../local/{ => process}/base_recalibration.nf | 0 .../local/{ => process}/build_intervals.nf | 0 modules/local/{ => process}/bwamem2_mem.nf | 0 .../{ => process}/create_intervals_bed.nf | 2 +- .../{ => process}/get_software_versions.nf | 0 .../local/{ => process}/merge_mapped_bam.nf | 0 .../{ => process}/output_documentation.nf | 0 .../subworkflow}/build_indices.nf | 12 +++++------ .../nf-core/{ => software}/bwamem2_index.nf | 0 modules/nf-core/{ => software}/fastqc.nf | 0 .../{local => nf-core/software}/gatk_dict.nf | 0 .../nf-core/{ => software}/htslib_tabix.nf | 0 .../software}/mark_duplicates.nf | 0 modules/nf-core/{ => software}/multiqc.nf | 0 .../nf-core/{ => software}/samtools_faidx.nf | 0 .../software}/trim_galore.nf | 0 18 files changed, 17 insertions(+), 17 deletions(-) rename modules/local/{ => custom}/functions.nf (100%) rename modules/local/{ => process}/base_recalibration.nf (100%) rename modules/local/{ => process}/build_intervals.nf (100%) rename modules/local/{ => process}/bwamem2_mem.nf (100%) rename modules/local/{ => process}/create_intervals_bed.nf (96%) rename modules/local/{ => process}/get_software_versions.nf (100%) rename modules/local/{ => process}/merge_mapped_bam.nf (100%) rename modules/local/{ => process}/output_documentation.nf (100%) rename modules/{subworkflows => local/subworkflow}/build_indices.nf (88%) rename modules/nf-core/{ => software}/bwamem2_index.nf (100%) rename modules/nf-core/{ => software}/fastqc.nf (100%) rename modules/{local => nf-core/software}/gatk_dict.nf (100%) rename modules/nf-core/{ => software}/htslib_tabix.nf (100%) rename modules/{local => nf-core/software}/mark_duplicates.nf (100%) rename modules/nf-core/{ => software}/multiqc.nf (100%) rename modules/nf-core/{ => software}/samtools_faidx.nf (100%) rename modules/{local => nf-core/software}/trim_galore.nf (100%) diff --git a/main.nf b/main.nf index a920a9abc6..2e3cc6cec8 100644 --- a/main.nf +++ b/main.nf @@ -45,7 
+45,7 @@ include { extract_fastq; extract_fastq_from_dir; has_extension -} from './modules/local/functions' +} from './modules/local/custom/functions' /* ================================================================================ @@ -252,12 +252,10 @@ if (params.sentieon) log.warn "[nf-core/sarek] Sentieon will be used, only works ================================================================================ */ -include { BWAMEM2_MEM } from './modules/local/bwamem2_mem.nf' -include { GET_SOFTWARE_VERSIONS } from './modules/local/get_software_versions' -include { OUTPUT_DOCUMENTATION } from './modules/local/output_documentation' -include { TRIM_GALORE } from './modules/local/trim_galore.nf' -include { MERGE_BAM_MAPPED } from './modules/local/merge_mapped_bam' -include { MARK_DUPLICATES } from './modules/local/mark_duplicates' addParams(skip_qc: skip_qc) +include { BWAMEM2_MEM } from './modules/local/process/bwamem2_mem.nf' +include { GET_SOFTWARE_VERSIONS } from './modules/local/process/get_software_versions' +include { OUTPUT_DOCUMENTATION } from './modules/local/process/output_documentation' +include { MERGE_BAM_MAPPED } from './modules/local/process/merge_mapped_bam' //include { BASE_RECALIBRATION } from './modules/local/base_recalibration' params(params) /* @@ -266,7 +264,7 @@ include { MARK_DUPLICATES } from './modules/local/mark_duplicates' addPara ================================================================================ */ -include { BUILD_INDICES } from './modules/subworkflows/build_indices' +include { BUILD_INDICES } from './modules/local/subworkflow/build_indices' /* ================================================================================ @@ -274,8 +272,10 @@ include { BUILD_INDICES } from './modules/subworkflows/build_indices' ================================================================================ */ -include { FASTQC } from './modules/nf-core/fastqc' -include { MULTIQC } from './modules/nf-core/multiqc' +include { TRIM_GALORE } from './modules/nf-core/software/trim_galore.nf' +include { MARK_DUPLICATES } from './modules/nf-core/software/mark_duplicates' +include { FASTQC } from './modules/nf-core/software/fastqc' +include { MULTIQC } from './modules/nf-core/software/multiqc' // PREPARING CHANNELS FOR PREPROCESSING AND QC diff --git a/modules/local/functions.nf b/modules/local/custom/functions.nf similarity index 100% rename from modules/local/functions.nf rename to modules/local/custom/functions.nf diff --git a/modules/local/base_recalibration.nf b/modules/local/process/base_recalibration.nf similarity index 100% rename from modules/local/base_recalibration.nf rename to modules/local/process/base_recalibration.nf diff --git a/modules/local/build_intervals.nf b/modules/local/process/build_intervals.nf similarity index 100% rename from modules/local/build_intervals.nf rename to modules/local/process/build_intervals.nf diff --git a/modules/local/bwamem2_mem.nf b/modules/local/process/bwamem2_mem.nf similarity index 100% rename from modules/local/bwamem2_mem.nf rename to modules/local/process/bwamem2_mem.nf diff --git a/modules/local/create_intervals_bed.nf b/modules/local/process/create_intervals_bed.nf similarity index 96% rename from modules/local/create_intervals_bed.nf rename to modules/local/process/create_intervals_bed.nf index b073dd5b17..c4afd300c1 100644 --- a/modules/local/create_intervals_bed.nf +++ b/modules/local/process/create_intervals_bed.nf @@ -1,4 +1,4 @@ -include { has_extension } from './functions' +include { has_extension } from 
'../custom/functions' process CREATE_INTERVALS_BED { tag "${intervals}" diff --git a/modules/local/get_software_versions.nf b/modules/local/process/get_software_versions.nf similarity index 100% rename from modules/local/get_software_versions.nf rename to modules/local/process/get_software_versions.nf diff --git a/modules/local/merge_mapped_bam.nf b/modules/local/process/merge_mapped_bam.nf similarity index 100% rename from modules/local/merge_mapped_bam.nf rename to modules/local/process/merge_mapped_bam.nf diff --git a/modules/local/output_documentation.nf b/modules/local/process/output_documentation.nf similarity index 100% rename from modules/local/output_documentation.nf rename to modules/local/process/output_documentation.nf diff --git a/modules/subworkflows/build_indices.nf b/modules/local/subworkflow/build_indices.nf similarity index 88% rename from modules/subworkflows/build_indices.nf rename to modules/local/subworkflow/build_indices.nf index 92934ee800..106610305b 100644 --- a/modules/subworkflows/build_indices.nf +++ b/modules/local/subworkflow/build_indices.nf @@ -6,17 +6,17 @@ // And then initialize channels based on params or indexes that were just built -include { BUILD_INTERVALS } from '../local/build_intervals.nf' -include { BWAMEM2_INDEX } from '../nf-core/bwamem2_index.nf' -include { CREATE_INTERVALS_BED } from '../local/create_intervals_bed.nf' -include { GATK_CREATE_SEQUENCE_DICTIONARY } from '../local/gatk_dict.nf' +include { BUILD_INTERVALS } from '../../local/process/build_intervals.nf' +include { BWAMEM2_INDEX } from '../../nf-core/software/bwamem2_index.nf' +include { CREATE_INTERVALS_BED } from '../../local/process/create_intervals_bed.nf' +include { GATK_CREATE_SEQUENCE_DICTIONARY } from '../../nf-core/software/gatk_dict.nf' include { HTSLIB_TABIX as HTSLIB_TABIX_DBSNP; HTSLIB_TABIX as HTSLIB_TABIX_GERMLINE_RESOURCE; HTSLIB_TABIX as HTSLIB_TABIX_KNOWN_INDELS; HTSLIB_TABIX as HTSLIB_TABIX_PON; -} from '../nf-core/htslib_tabix' -include { SAMTOOLS_FAIDX } from '../nf-core/samtools_faidx.nf' +} from '../../nf-core/software/htslib_tabix' +include { SAMTOOLS_FAIDX } from '../../nf-core/software/samtools_faidx.nf' workflow BUILD_INDICES{ take: diff --git a/modules/nf-core/bwamem2_index.nf b/modules/nf-core/software/bwamem2_index.nf similarity index 100% rename from modules/nf-core/bwamem2_index.nf rename to modules/nf-core/software/bwamem2_index.nf diff --git a/modules/nf-core/fastqc.nf b/modules/nf-core/software/fastqc.nf similarity index 100% rename from modules/nf-core/fastqc.nf rename to modules/nf-core/software/fastqc.nf diff --git a/modules/local/gatk_dict.nf b/modules/nf-core/software/gatk_dict.nf similarity index 100% rename from modules/local/gatk_dict.nf rename to modules/nf-core/software/gatk_dict.nf diff --git a/modules/nf-core/htslib_tabix.nf b/modules/nf-core/software/htslib_tabix.nf similarity index 100% rename from modules/nf-core/htslib_tabix.nf rename to modules/nf-core/software/htslib_tabix.nf diff --git a/modules/local/mark_duplicates.nf b/modules/nf-core/software/mark_duplicates.nf similarity index 100% rename from modules/local/mark_duplicates.nf rename to modules/nf-core/software/mark_duplicates.nf diff --git a/modules/nf-core/multiqc.nf b/modules/nf-core/software/multiqc.nf similarity index 100% rename from modules/nf-core/multiqc.nf rename to modules/nf-core/software/multiqc.nf diff --git a/modules/nf-core/samtools_faidx.nf b/modules/nf-core/software/samtools_faidx.nf similarity index 100% rename from modules/nf-core/samtools_faidx.nf rename 
to modules/nf-core/software/samtools_faidx.nf diff --git a/modules/local/trim_galore.nf b/modules/nf-core/software/trim_galore.nf similarity index 100% rename from modules/local/trim_galore.nf rename to modules/nf-core/software/trim_galore.nf From 8a8c8aada7af74a9f47c6c79f9d2cb150a431b0a Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Fri, 24 Jul 2020 14:25:35 +0200 Subject: [PATCH 070/200] fix names --- main.nf | 10 +++---- modules/local/process/create_intervals_bed.nf | 2 +- modules/local/subworkflow/build_indices.nf | 30 +++++++++---------- 3 files changed, 20 insertions(+), 22 deletions(-) diff --git a/main.nf b/main.nf index 2e3cc6cec8..88a70777ea 100644 --- a/main.nf +++ b/main.nf @@ -45,7 +45,7 @@ include { extract_fastq; extract_fastq_from_dir; has_extension -} from './modules/local/custom/functions' +} from './modules/local/functions' /* ================================================================================ @@ -272,10 +272,10 @@ include { BUILD_INDICES } from './modules/local/subworkflow/build_indices' ================================================================================ */ -include { TRIM_GALORE } from './modules/nf-core/software/trim_galore.nf' -include { MARK_DUPLICATES } from './modules/nf-core/software/mark_duplicates' -include { FASTQC } from './modules/nf-core/software/fastqc' -include { MULTIQC } from './modules/nf-core/software/multiqc' +include { TRIMGALORE } from './modules/nf-core/software/trimgalore.nf' +include { GATK_MARKDUPLICATES as MARKDUPLICATES} from './modules/nf-core/software/gatk_markduplicates' +include { FASTQC } from './modules/nf-core/software/fastqc' +include { MULTIQC } from './modules/nf-core/software/multiqc' // PREPARING CHANNELS FOR PREPROCESSING AND QC diff --git a/modules/local/process/create_intervals_bed.nf b/modules/local/process/create_intervals_bed.nf index c4afd300c1..4e93264a92 100644 --- a/modules/local/process/create_intervals_bed.nf +++ b/modules/local/process/create_intervals_bed.nf @@ -1,4 +1,4 @@ -include { has_extension } from '../custom/functions' +include { has_extension } from '../functions' process CREATE_INTERVALS_BED { tag "${intervals}" diff --git a/modules/local/subworkflow/build_indices.nf b/modules/local/subworkflow/build_indices.nf index 106610305b..6e28c870b6 100644 --- a/modules/local/subworkflow/build_indices.nf +++ b/modules/local/subworkflow/build_indices.nf @@ -6,17 +6,15 @@ // And then initialize channels based on params or indexes that were just built -include { BUILD_INTERVALS } from '../../local/process/build_intervals.nf' -include { BWAMEM2_INDEX } from '../../nf-core/software/bwamem2_index.nf' -include { CREATE_INTERVALS_BED } from '../../local/process/create_intervals_bed.nf' -include { GATK_CREATE_SEQUENCE_DICTIONARY } from '../../nf-core/software/gatk_dict.nf' -include { - HTSLIB_TABIX as HTSLIB_TABIX_DBSNP; - HTSLIB_TABIX as HTSLIB_TABIX_GERMLINE_RESOURCE; - HTSLIB_TABIX as HTSLIB_TABIX_KNOWN_INDELS; - HTSLIB_TABIX as HTSLIB_TABIX_PON; -} from '../../nf-core/software/htslib_tabix' -include { SAMTOOLS_FAIDX } from '../../nf-core/software/samtools_faidx.nf' +include { BUILD_INTERVALS } from '../process/build_intervals.nf' +include { BWAMEM2_INDEX } from '../../nf-core/software/bwamem2_index.nf' +include { CREATE_INTERVALS_BED } from '../process/create_intervals_bed.nf' +include { GATK_CREATESEQUENCEDICTIONARY as GATK_DICT } from '../../nf-core/software/gatk_createsequencedictionary.nf' +include { HTSLIB_TABIX as TABIX_DBSNP; + HTSLIB_TABIX as TABIX_GERMLINE_RESOURCE; + HTSLIB_TABIX as 
TABIX_KNOWN_INDELS; + HTSLIB_TABIX as TABIX_PON;} from '../../nf-core/software/htslib_tabix' +include { SAMTOOLS_FAIDX } from '../../nf-core/software/samtools_faidx.nf' workflow BUILD_INDICES{ take: @@ -36,7 +34,7 @@ workflow BUILD_INDICES{ result_dict = Channel.empty() if (!(params.dict) && params.fasta && !('annotate' in step) && !('controlfreec' in step)) - result_dict = GATK_CREATE_SEQUENCE_DICTIONARY(fasta) + result_dict = GATK_DICT(fasta) result_fai = Channel.empty() if (!(params.fasta_fai) && params.fasta && !('annotate' in step)) @@ -44,19 +42,19 @@ workflow BUILD_INDICES{ result_dbsnp_tbi = Channel.empty() if (!(params.dbsnp_index) && params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || 'tnscope' in tools)) - result_dbsnp_tbi = HTSLIB_TABIX_DBSNP(dbsnp) + result_dbsnp_tbi = TABIX_DBSNP(dbsnp) result_germline_resource_tbi = Channel.empty() if (!(params.germline_resource_index) && params.germline_resource && 'mutect2' in tools) - result_germline_resource_tbi = HTSLIB_TABIX_GERMLINE_RESOURCE(germline_resource) + result_germline_resource_tbi = TABIX_GERMLINE_RESOURCE(germline_resource) result_known_indels_tbi = Channel.empty() if (!(params.known_indels_index) && params.known_indels && ('mapping' in step || 'preparerecalibration' in step)) - result_known_indels_tbi = HTSLIB_TABIX_KNOWN_INDELS(known_indels) + result_known_indels_tbi = TABIX_KNOWN_INDELS(known_indels) result_pon_tbi = Channel.empty() if (!(params.pon_index) && params.pon && ('tnscope' in tools || 'mutect2' in tools)) - result_pon_tbi = HTSLIB_TABIX_PON(pon) + result_pon_tbi = TABIX_PON(pon) if (params.no_intervals) { file("${params.outdir}/no_intervals.bed").text = "no_intervals\n" From 5dcf40481232c6832326c10c37600478048f334e Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Fri, 24 Jul 2020 14:44:25 +0200 Subject: [PATCH 071/200] rename files --- modules/local/{custom => }/functions.nf | 12 +++++------- ...gatk_dict.nf => gatk_createsequencedictionary.nf} | 2 +- .../{mark_duplicates.nf => gatk_markduplicates.nf} | 2 +- .../software/{trim_galore.nf => trimgalore.nf} | 2 +- 4 files changed, 8 insertions(+), 10 deletions(-) rename modules/local/{custom => }/functions.nf (96%) rename modules/nf-core/software/{gatk_dict.nf => gatk_createsequencedictionary.nf} (91%) rename modules/nf-core/software/{mark_duplicates.nf => gatk_markduplicates.nf} (98%) rename modules/nf-core/software/{trim_galore.nf => trimgalore.nf} (99%) diff --git a/modules/local/custom/functions.nf b/modules/local/functions.nf similarity index 96% rename from modules/local/custom/functions.nf rename to modules/local/functions.nf index 584201ac0f..1ef236bf85 100644 --- a/modules/local/custom/functions.nf +++ b/modules/local/functions.nf @@ -1,9 +1,7 @@ /* -================================================================================ - sarek functions -================================================================================ -*/ - + * This file holds several functions used to perform operations in Sarek + */ + // Check if a row has the expected number of item def check_number_of_item(row, number) { if (row.size() != number) exit 1, "Malformed row in TSV file: ${row}, see --help for more information" @@ -162,7 +160,7 @@ def extract_fastq(tsvFile) { } } else if (has_extension(read1, "bam")) check_number_of_item(row, 6) - else "No recognisable extention for input file: ${read1}" + else exit 1, "No recognisable extension for input file: ${read1}" return [meta, 
[read1, read2]] } @@ -258,4 +256,4 @@ def reduce_vcf(file) { def return_status(it) { if (!(it in [0, 1])) exit 1, "Status is not recognized in TSV file: ${it}, see --help for more information" return it -} +} \ No newline at end of file diff --git a/modules/nf-core/software/gatk_dict.nf b/modules/nf-core/software/gatk_createsequencedictionary.nf similarity index 91% rename from modules/nf-core/software/gatk_dict.nf rename to modules/nf-core/software/gatk_createsequencedictionary.nf index 9c811940c0..0b8baefdd9 100644 --- a/modules/nf-core/software/gatk_dict.nf +++ b/modules/nf-core/software/gatk_createsequencedictionary.nf @@ -1,4 +1,4 @@ -process GATK_CREATE_SEQUENCE_DICTIONARY { +process GATK_CREATESEQUENCEDICTIONARY { tag "${fasta}" publishDir params.outdir, mode: params.publish_dir_mode, diff --git a/modules/nf-core/software/mark_duplicates.nf b/modules/nf-core/software/gatk_markduplicates.nf similarity index 98% rename from modules/nf-core/software/mark_duplicates.nf rename to modules/nf-core/software/gatk_markduplicates.nf index 88b779a534..ba7e99099d 100644 --- a/modules/nf-core/software/mark_duplicates.nf +++ b/modules/nf-core/software/gatk_markduplicates.nf @@ -1,4 +1,4 @@ -process MARK_DUPLICATES { +process GATK_MARKDUPLICATES { label 'cpus_16' tag "${id}" diff --git a/modules/nf-core/software/trim_galore.nf b/modules/nf-core/software/trimgalore.nf similarity index 99% rename from modules/nf-core/software/trim_galore.nf rename to modules/nf-core/software/trimgalore.nf index a49ccda2d7..0cf805bb01 100644 --- a/modules/nf-core/software/trim_galore.nf +++ b/modules/nf-core/software/trimgalore.nf @@ -1,4 +1,4 @@ -process TRIM_GALORE { +process TRIMGALORE { label 'TrimGalore' tag "${idPatient}-${idRun}" From 62fcea56441213971922e0ba8441cec5a2cd4fc4 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Fri, 24 Jul 2020 14:44:51 +0200 Subject: [PATCH 072/200] add conf/modules.config file --- conf/module.config | 53 ++++++++++++++++++++++++++++++++++++++++++++++ nextflow.config | 3 +++ 2 files changed, 56 insertions(+) create mode 100644 conf/module.config diff --git a/conf/module.config b/conf/module.config new file mode 100644 index 0000000000..106bb282eb --- /dev/null +++ b/conf/module.config @@ -0,0 +1,53 @@ +/* + * ------------------------------------------------- + * Nextflow config file for module specific-options + * ------------------------------------------------- + */ + +params { + modules { + 'fastqc' { + args = "--quiet" + suffix = "" + publish_dir = "fastqc" + publish_results = "all" + } + 'trimgalore' { + args = "--fastqc" + suffix = "" + publish_dir = "trim_galore" + publish_results = "all" + } + 'bwamem2_index' { + args = "-a bwtsw" + suffix = "" + publish_dir = "genome/bwa_index" + publish_results = "all" + } + 'bwamem2_mem' { + args = "-M" + args2 = "-h -F 0x0100 -O BAM" + suffix = ".Lb" + publish_dir = "bwa/library" + publish_results = "all" + } + 'gatk_markduplicates' { + args = "ASSUME_SORTED=true REMOVE_DUPLICATES=false VALIDATION_STRINGENCY=LENIENT TMP_DIR=tmp" + suffix = ".mLb.mkD" + publish_dir = "bwa/mergedLibrary" + publish_results = "all" + } + 'get_software_versions' { + args = "" + suffix = "" + publish_dir = "pipeline_info" + publish_results = "all" + } + 'output_documentation' { + args = "" + suffix = "" + publish_dir = "pipeline_info" + publish_results = "all" + } + } +} diff --git a/nextflow.config b/nextflow.config index 2214f1746f..0a94a3559d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -119,6 +119,9 @@ process.container = 'nfcore/sarek:dev' // Load 
base.config by default for all pipelines includeConfig 'conf/base.config' +// Load modules.config by default for all pipelines +includeConfig 'conf/modules.config' + // Load nf-core custom profiles from different Institutions try { includeConfig "${params.custom_config_base}/nfcore_custom.config" From cb1eefdd2d3f0ffc0d3058a86c7b515012ca8fc1 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Fri, 24 Jul 2020 15:02:34 +0200 Subject: [PATCH 073/200] typo --- conf/{module.config => modules.config} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename conf/{module.config => modules.config} (100%) diff --git a/conf/module.config b/conf/modules.config similarity index 100% rename from conf/module.config rename to conf/modules.config From 4043b8d75d28549175cf180a763d2e2f0e183e07 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Fri, 24 Jul 2020 16:59:56 +0200 Subject: [PATCH 074/200] use qc_trim subworkflow for fastqc + trimgalore --- conf/modules.config | 2 +- main.nf | 75 ++++++++++++-------------- modules/nf-core/software/fastqc.nf | 14 +++-- modules/nf-core/software/trimgalore.nf | 73 ++++++++++++++----------- modules/nf-core/subworkflow/qc_trim.nf | 53 ++++++++++++++++++ 5 files changed, 139 insertions(+), 78 deletions(-) create mode 100644 modules/nf-core/subworkflow/qc_trim.nf diff --git a/conf/modules.config b/conf/modules.config index 106bb282eb..80daaf6af8 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -15,7 +15,7 @@ params { 'trimgalore' { args = "--fastqc" suffix = "" - publish_dir = "trim_galore" + publish_dir = "trimgalore" publish_results = "all" } 'bwamem2_index' { diff --git a/main.nf b/main.nf index 88a70777ea..1e18fdec30 100644 --- a/main.nf +++ b/main.nf @@ -252,7 +252,7 @@ if (params.sentieon) log.warn "[nf-core/sarek] Sentieon will be used, only works ================================================================================ */ -include { BWAMEM2_MEM } from './modules/local/process/bwamem2_mem.nf' +include { BWAMEM2_MEM } from './modules/local/process/bwamem2_mem' include { GET_SOFTWARE_VERSIONS } from './modules/local/process/get_software_versions' include { OUTPUT_DOCUMENTATION } from './modules/local/process/output_documentation' include { MERGE_BAM_MAPPED } from './modules/local/process/merge_mapped_bam' @@ -272,11 +272,17 @@ include { BUILD_INDICES } from './modules/local/subworkflow/build_indices' ================================================================================ */ -include { TRIMGALORE } from './modules/nf-core/software/trimgalore.nf' include { GATK_MARKDUPLICATES as MARKDUPLICATES} from './modules/nf-core/software/gatk_markduplicates' -include { FASTQC } from './modules/nf-core/software/fastqc' include { MULTIQC } from './modules/nf-core/software/multiqc' +/* +================================================================================ + INCLUDE nf-core PIPELINE SUBWORKFLOWS +================================================================================ +*/ + +include { QC_TRIM } from './modules/nf-core/subworkflow/qc_trim' + // PREPARING CHANNELS FOR PREPROCESSING AND QC // input_bam = Channel.empty() @@ -346,28 +352,15 @@ workflow { // PREPROCESSING - fastqc_html = Channel.empty() - fastqc_version = Channel.empty() - fastqc_zip = Channel.empty() + QC_TRIM( + input_sample, + ('fastqc' in skip_qc), + !(params.trim_fastq), + params.modules['fastqc'], + params.modules['trimgalore'] + ) - if (!('fastqc' in skip_qc)) { - FASTQC(input_sample) - fastqc_html = FASTQC.out.html - fastqc_version = FASTQC.out.version - fastqc_zip = 
FASTQC.out.zip - } - - def bwamem2_mem_options = [:] - - bwamem2_mem_options.args_bwamem2 = "-K 100000000 -M" - trim_galore_report = Channel.empty() - - if (params.trim_fastq) { - TRIM_GALORE(input_sample) - BWAMEM2_MEM(TRIM_GALORE.out.trimmed_reads, bwa, fasta, fai, bwamem2_mem_options) - trim_galore_report = TRIM_GALORE.out.report - } - else BWAMEM2_MEM(input_sample, bwa, fasta, fai, bwamem2_mem_options) + BWAMEM2_MEM(QC_TRIM.out.reads, bwa, fasta, fai, params.modules['bwamem2_mem']) results = BWAMEM2_MEM.out.map{ meta, bam, bai -> patient = meta.patient @@ -422,25 +415,25 @@ workflow { // bam_duplicates_marked = MARK_DUPLICATES.out.bam } - // bamBaseRecalibrator = bam_duplicates_marked.combine(BUILD_INDICES.out.intervals_bed) +// // bamBaseRecalibrator = bam_duplicates_marked.combine(BUILD_INDICES.out.intervals_bed) - // //BASE_RECALIBRATION(bamBaseRecalibrator,dbsnp, dbsnp_index,fasta,) +// // //BASE_RECALIBRATION(bamBaseRecalibrator,dbsnp, dbsnp_index,fasta,) - OUTPUT_DOCUMENTATION( - output_docs, - output_docs_images) - - GET_SOFTWARE_VERSIONS() - - MULTIQC( - fastqc_html.collect().ifEmpty([]), - fastqc_zip.collect().ifEmpty([]), - multiqc_config, - multiqc_custom_config.ifEmpty([]), - GET_SOFTWARE_VERSIONS.out.yml, - trim_galore_report.collect().ifEmpty([]), - mark_duplicates_report.collect().ifEmpty([]), - workflow_summary) +// OUTPUT_DOCUMENTATION( +// output_docs, +// output_docs_images) + +// GET_SOFTWARE_VERSIONS() + +// MULTIQC( +// fastqc_html.collect().ifEmpty([]), +// fastqc_zip.collect().ifEmpty([]), +// multiqc_config, +// multiqc_custom_config.ifEmpty([]), +// GET_SOFTWARE_VERSIONS.out.yml, +// trimgalore_report.collect().ifEmpty([]), +// markduplicates_report.collect().ifEmpty([]), +// workflow_summary) } /* diff --git a/modules/nf-core/software/fastqc.nf b/modules/nf-core/software/fastqc.nf index 2533f70fa3..2f1fe732be 100644 --- a/modules/nf-core/software/fastqc.nf +++ b/modules/nf-core/software/fastqc.nf @@ -3,16 +3,18 @@ process FASTQC { label 'process_medium' label 'cpus_2' - publishDir "${params.outdir}/Reports/${meta.sample}/FastQC/${meta.id}", + publishDir "${params.outdir}/${options.publish_dir}", mode: params.publish_dir_mode, saveAs: { filename -> - if (filename.endsWith('.version.txt')) null + if (options.publish_results == "none") null + else if (filename.endsWith('.version.txt')) null else filename } container "quay.io/biocontainers/fastqc:0.11.9--0" input: tuple val(meta), path(reads) + val options output: path "*.html", emit: html @@ -20,12 +22,14 @@ process FASTQC { path "*.zip", emit: zip script: - prefix = "${meta.id}" + // Add soft-links to original FastQs for consistent naming in pipeline + prefix = options.suffix ? "${meta.id}.${options.suffix}" : "${meta.id}" """ [ ! -f ${prefix}_1.fastq.gz ] && ln -s ${reads[0]} ${prefix}_1.fastq.gz [ ! 
-f ${prefix}_2.fastq.gz ] && ln -s ${reads[1]} ${prefix}_2.fastq.gz - fastqc --threads ${task.cpus} ${prefix}_1.fastq.gz ${prefix}_2.fastq.gz + + fastqc ${options.args} --threads ${task.cpus} ${prefix}_1.fastq.gz ${prefix}_2.fastq.gz fastqc --version | sed -n "s/.*\\(v.*\$\\)/\\1/p" > fastqc.version.txt """ -} +} \ No newline at end of file diff --git a/modules/nf-core/software/trimgalore.nf b/modules/nf-core/software/trimgalore.nf index 0cf805bb01..ad4cc1e5d3 100644 --- a/modules/nf-core/software/trimgalore.nf +++ b/modules/nf-core/software/trimgalore.nf @@ -1,22 +1,26 @@ process TRIMGALORE { - label 'TrimGalore' + tag "${meta.id}" + label 'process_high' - tag "${idPatient}-${idRun}" + publishDir "${params.outdir}/${options.publish_dir}", + mode: params.publish_dir_mode, + saveAs: { filename -> + if (options.publish_results == "none") null + else if (filename.endsWith('.version.txt')) null + else filename } - publishDir "${params.outdir}/Reports/${idSample}/TrimGalore/${idSample}_${idRun}", mode: params.publish_dir_mode, - saveAs: {filename -> - if (filename.indexOf("_fastqc") > 0) "FastQC/${filename}" - else if (filename.indexOf("trimming_report.txt") > 0) "logs/${filename}" - else if (params.save_trimmed) filename - else null - } + container "quay.io/biocontainers/trim-galore:0.6.5--0" input: - tuple val(idPatient), val(idSample), val(idRun), path("${idSample}_${idRun}_R1.fastq.gz"), path("${idSample}_${idRun}_R2.fastq.gz") + tuple val(meta), path(reads) + val options output: - path "*.{html,zip,txt}", emit: report - tuple val(idPatient), val(idSample), val(idRun), path("${idSample}_${idRun}_R1_val_1.fq.gz"), path("${idSample}_${idRun}_R2_val_2.fq.gz"), emit: trimmed_reads + tuple val(meta), path("*_1.fq.gz"), path("*_2.fq.gz"), emit: reads + path "*.html" , emit: html optional true + path "*.txt" , emit: log + path "*.version.txt", emit: version + path "*.zip" , emit: zip optional true script: // Calculate number of --cores for TrimGalore based on value of task.cpus @@ -24,29 +28,36 @@ process TRIMGALORE { // See: https://github.com/nf-core/atacseq/pull/65 def cores = 1 if (task.cpus) { - cores = (task.cpus as int) - 4 - if (cores < 1) cores = 1 - if (cores > 4) cores = 4 - } + cores = (task.cpus as int) - 4 + if (meta.single_end) cores = (task.cpus as int) - 3 + if (cores < 1) cores = 1 + if (cores > 4) cores = 4 + } + + // Clipping presets have to be evaluated in the context of SE/PE c_r1 = params.clip_r1 > 0 ? "--clip_r1 ${params.clip_r1}" : '' c_r2 = params.clip_r2 > 0 ? "--clip_r2 ${params.clip_r2}" : '' tpc_r1 = params.three_prime_clip_r1 > 0 ? "--three_prime_clip_r1 ${params.three_prime_clip_r1}" : '' tpc_r2 = params.three_prime_clip_r2 > 0 ? "--three_prime_clip_r2 ${params.three_prime_clip_r2}" : '' - nextseq = params.trim_nextseq > 0 ? "--nextseq ${params.trim_nextseq}" : '' + + // Added soft-links to original fastqs for consistent naming in MultiQC + prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}" """ - trim_galore \ - --cores ${cores} \ - --paired \ - --fastqc \ - --gzip \ - ${c_r1} ${c_r2} \ - ${tpc_r1} ${tpc_r2} \ - ${nextseq} \ - ${idSample}_${idRun}_R1.fastq.gz ${idSample}_${idRun}_R2.fastq.gz - - mv *val_1_fastqc.html "${idSample}_${idRun}_R1.trimmed_fastqc.html" - mv *val_2_fastqc.html "${idSample}_${idRun}_R2.trimmed_fastqc.html" - mv *val_1_fastqc.zip "${idSample}_${idRun}_R1.trimmed_fastqc.zip" - mv *val_2_fastqc.zip "${idSample}_${idRun}_R2.trimmed_fastqc.zip" + [ ! -f ${prefix}_1.fastq.gz ] && ln -s ${reads[0]} ${prefix}_1.fastq.gz + [ ! 
-f ${prefix}_2.fastq.gz ] && ln -s ${reads[1]} ${prefix}_2.fastq.gz + + trim_galore \\ + ${options.args} \\ + --cores ${cores} \\ + --paired \\ + --gzip \\ + ${c_r1} \\ + ${c_r2} \\ + ${tpc_r1} \\ + ${tpc_r2} \\ + ${prefix}_1.fastq.gz \\ + ${prefix}_2.fastq.gz + + trim_galore --version > trim_galore.version.txt """ } diff --git a/modules/nf-core/subworkflow/qc_trim.nf b/modules/nf-core/subworkflow/qc_trim.nf new file mode 100644 index 0000000000..d9fedc1250 --- /dev/null +++ b/modules/nf-core/subworkflow/qc_trim.nf @@ -0,0 +1,53 @@ +/* + * Read QC and trimming + */ + +include { FASTQC } from '../software/fastqc' +include { TRIMGALORE } from '../software/trimgalore' + +workflow QC_TRIM { + take: + + reads // channel: [ val(meta), [ reads ] ] + skip_fastqc // boolean: true/false + skip_trimming // boolean: true/false + fastqc_opts // map: options for FastQC module + trimgalore_opts // map: options for TrimGalore! module + + main: + + fastqc_html = Channel.empty() + fastqc_version = Channel.empty() + fastqc_zip = Channel.empty() + if (!skip_fastqc) { + FASTQC(reads, fastqc_opts) + fastqc_html = FASTQC.out.html + fastqc_version = FASTQC.out.version + fastqc_zip = FASTQC.out.zip + } + + trim_reads = reads + trimgalore_html = Channel.empty() + trimgalore_zip = Channel.empty() + trimgalore_log = Channel.empty() + trimgalore_version = Channel.empty() + if (!skip_trimming) { + TRIMGALORE(reads, trimgalore_opts) + trim_reads = TRIMGALORE.out.reads + trimgalore_html = TRIMGALORE.out.html + trimgalore_zip = TRIMGALORE.out.zip + trimgalore_log = TRIMGALORE.out.log + trimgalore_version = TRIMGALORE.out.version + } + + emit: + + fastqc_html // path: *.html + fastqc_zip // path: *.zip + fastqc_version // path: *.version.txt + reads = trim_reads // channel: [ val(meta), [ reads ] ] + trimgalore_html // path: *.html + trimgalore_zip // path: *.zip + trimgalore_log // path: *.txt + trimgalore_version // path: *.version.txt +} From 926c4e9db446eee9d60e68f71e627adfce8cd2a4 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Fri, 24 Jul 2020 17:17:37 +0200 Subject: [PATCH 075/200] back to markduplicates --- main.nf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/main.nf b/main.nf index 1e18fdec30..15106b0ece 100644 --- a/main.nf +++ b/main.nf @@ -406,16 +406,16 @@ workflow { bam_mapped.view() - mark_duplicates_report = Channel.empty() - bam_duplicates_marked = Channel.empty() + markduplicates_report = Channel.empty() + markduplicates_bam = Channel.empty() if (!(params.skip_markduplicates)) { - // MARK_DUPLICATES(bam_mapped) - // mark_duplicates_report = MARK_DUPLICATES.out.report - // bam_duplicates_marked = MARK_DUPLICATES.out.bam + // MARKDUPLICATES(bam_mapped) + // markduplicates_report = MARKDUPLICATES.out.report + // markduplicates_bam = MARKDUPLICATES.out.bam } -// // bamBaseRecalibrator = bam_duplicates_marked.combine(BUILD_INDICES.out.intervals_bed) +// // bamBaseRecalibrator = markduplicates_bam.combine(BUILD_INDICES.out.intervals_bed) // // //BASE_RECALIBRATION(bamBaseRecalibrator,dbsnp, dbsnp_index,fasta,) From 6e5f2daba267b01a222fabd66af01240422fccda Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Fri, 24 Jul 2020 18:30:07 +0200 Subject: [PATCH 076/200] code polishing --- lib/Schema.groovy | 26 +++++++++++----------- main.nf | 17 +++++++------- modules/local/subworkflow/build_indices.nf | 2 +- 3 files changed, 22 insertions(+), 23 deletions(-) diff --git a/lib/Schema.groovy b/lib/Schema.groovy index 20f49d03e0..6b7432fa4f 100644 --- a/lib/Schema.groovy +++ 
b/lib/Schema.groovy @@ -78,7 +78,7 @@ class JSON { return output } - private static LinkedHashMap params_summary(workflow, params, run_name) { + private static LinkedHashMap params_summary(workflow, params, run_name, step, tools, skip_qc, annotate_tools) { def Map summary = [:] if (workflow.revision) summary['Pipeline Release'] = workflow.revision summary['Run Name'] = run_name ?: workflow.runName @@ -86,14 +86,14 @@ class JSON { if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container" summary['Input'] = params.input - // summary['Step'] = step + summary['Step'] = step summary['Genome'] = params.genome - // if (params.no_intervals && step != 'annotate') summary['Intervals'] = 'Do not use' + if (params.no_intervals && step != 'annotate') summary['Intervals'] = 'Do not use' summary['Nucleotides/s'] = params.nucleotides_per_second if (params.sentieon) summary['Sention'] = "Using Sentieon for Preprocessing and/or Variant Calling" - // if (params.skip_qc) summary['QC tools skipped'] = skip_qc.join(', ') + if (params.skip_qc) summary['QC tools skipped'] = skip_qc.join(', ') if (params.target_bed) summary['Target BED'] = params.target_bed - // if (params.tools) summary['Tools'] = tools.join(', ') + if (params.tools) summary['Tools'] = tools.join(', ') if (params.trim_fastq || params.split_fastq) summary['Modify fastqs'] = "trim and/or split" if (params.trim_fastq) { @@ -114,24 +114,24 @@ class JSON { summary['Save BAMs mapped'] = params.save_bam_mapped ? 'Yes' : 'No' summary['Skip MarkDuplicates'] = params.skip_markduplicates ? 'Yes' : 'No' - // if ('ascat' in tools) { + if ('ascat' in tools) { summary['ASCAT'] = "Options" if (params.ascat_purity) summary['purity'] = params.ascat_purity if (params.ascat_ploidy) summary['ploidy'] = params.ascat_ploidy - // } + } - // if ('controlfreec' in tools) { + if ('controlfreec' in tools) { summary['Control-FREEC'] = "Options" if (params.cf_window) summary['window'] = params.cf_window if (params.cf_coeff) summary['coeff of variation'] = params.cf_coeff if (params.cf_ploidy) summary['ploidy'] = params.cf_ploidy - // } + } - // if ('haplotypecaller' in tools) summary['GVCF'] = params.no_gvcf ? 'No' : 'Yes' - // if ('strelka' in tools && 'manta' in tools) summary['Strelka BP'] = params.no_strelka_bp ? 'No' : 'Yes' - // if (params.pon && ('mutect2' in tools || (params.sentieon && 'tnscope' in tools))) summary['Panel of normals'] = params.pon + if ('haplotypecaller' in tools) summary['GVCF'] = params.no_gvcf ? 'No' : 'Yes' + if ('strelka' in tools && 'manta' in tools) summary['Strelka BP'] = params.no_strelka_bp ? 'No' : 'Yes' + if (params.pon && ('mutect2' in tools || (params.sentieon && 'tnscope' in tools))) summary['Panel of normals'] = params.pon - // if (params.annotate_tools) summary['Tools to annotate'] = annotate_tools.join(', ') + if (params.annotate_tools) summary['Tools to annotate'] = annotate_tools.join(', ') if (params.annotation_cache) { summary['Annotation cache'] = "Enabled" diff --git a/main.nf b/main.nf index 15106b0ece..569c37e67c 100644 --- a/main.nf +++ b/main.nf @@ -39,6 +39,7 @@ include { check_parameter_existence; check_parameter_list; define_skip_qc_list; + define_anno_list; define_step_list; define_tool_list; extract_bam; @@ -90,9 +91,9 @@ skip_qc_list = define_skip_qc_list() skip_qc = params.skip_qc ? params.skip_qc == 'all' ? 
skip_qc_list : params.skip_qc.split(',').collect{it.trim().toLowerCase().replaceAll('-', '').replaceAll('_', '')} : [] if (!check_parameter_list(skip_qc, skip_qc_list)) exit 1, 'Unknown QC tool(s), see --help for more information' -// anno_list = define_anno_list() -// annotate_tools = params.annotate_tools ? params.annotate_tools.split(',').collect{it.trim().toLowerCase().replaceAll('-', '')} : [] -// if (!check_parameter_list(annotate_tools,anno_list)) exit 1, 'Unknown tool(s) to annotate, see --help for more information' +anno_list = define_anno_list() +annotate_tools = params.annotate_tools ? params.annotate_tools.split(',').collect{it.trim().toLowerCase().replaceAll('-', '')} : [] +if (!check_parameter_list(annotate_tools,anno_list)) exit 1, 'Unknown tool(s) to annotate, see --help for more information' // // Check parameters // if ((params.ascat_ploidy && !params.ascat_purity) || (!params.ascat_ploidy && params.ascat_purity)) exit 1, 'Please specify both --ascat_purity and --ascat_ploidy, or none of them' @@ -234,7 +235,7 @@ run_name = params.name if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) { run_name = workflow.runName } -summary = Schema.params_summary(workflow, params, run_name) +summary = Schema.params_summary(workflow, params, run_name, step, tools, skip_qc, annotate_tools) log.info Headers.nf_core(workflow, params.monochrome_logs) log.info summary.collect { k,v -> "${k.padRight(20)}: $v" }.join("\n") log.info "-\033[2m----------------------------------------------------\033[0m-" @@ -344,9 +345,9 @@ workflow { bwa = params.bwa ?: BUILD_INDICES.out.bwa dbsnp_tbi = params.dbsnp ? params.dbsnp_index ?: BUILD_INDICES.out.dbsnp_tbi : Channel.empty() dict = params.dict ?: BUILD_INDICES.out.dict - fai = params.fasta_fai ? params.fasta_fai : BUILD_INDICES.out.fai + fai = params.fasta_fai ?: BUILD_INDICES.out.fai germline_resource_tbi = params.germline_resource ? params.germline_resource_index ?: BUILD_INDICES.out.germline_resource_tbi : Channel.empty() - intervals_bed = BUILD_INDICES.out.intervals_bed + intervals = BUILD_INDICES.out.intervals known_indels_tbi = params.known_indels ? params.known_indels_index ?: BUILD_INDICES.out.known_indels_tbi.collect() : Channel.empty() pon_tbi = params.pon ? 
params.pon_index ?: BUILD_INDICES.out.pon_tbi : Channel.empty() @@ -404,8 +405,6 @@ workflow { bam_mapped = bam_single.mix(MERGE_BAM_MAPPED(bam_multiple)) - bam_mapped.view() - markduplicates_report = Channel.empty() markduplicates_bam = Channel.empty() @@ -415,7 +414,7 @@ workflow { // markduplicates_bam = MARKDUPLICATES.out.bam } -// // bamBaseRecalibrator = markduplicates_bam.combine(BUILD_INDICES.out.intervals_bed) +// // bamBaseRecalibrator = markduplicates_bam.combine(BUILD_INDICES.out.intervals) // // //BASE_RECALIBRATION(bamBaseRecalibrator,dbsnp, dbsnp_index,fasta,) diff --git a/modules/local/subworkflow/build_indices.nf b/modules/local/subworkflow/build_indices.nf index 6e28c870b6..4c5a23841c 100644 --- a/modules/local/subworkflow/build_indices.nf +++ b/modules/local/subworkflow/build_indices.nf @@ -90,7 +90,7 @@ workflow BUILD_INDICES{ dict = result_dict fai = result_fai germline_resource_tbi = result_germline_resource_tbi - intervals_bed = result_intervals + intervals = result_intervals known_indels_tbi = result_known_indels_tbi pon_tbi = result_pon_tbi } From 50fa708ad628d0d6d6caf59d7ebf30374e51e5ec Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Fri, 24 Jul 2020 19:37:13 +0200 Subject: [PATCH 077/200] code polishing --- main.nf | 36 ++++++++++++++++++++---------------- nextflow.config | 6 ++++-- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/main.nf b/main.nf index 569c37e67c..38c9b4cd73 100644 --- a/main.nf +++ b/main.nf @@ -38,8 +38,8 @@ if (params.help) { include { check_parameter_existence; check_parameter_list; - define_skip_qc_list; define_anno_list; + define_skip_qc_list; define_step_list; define_tool_list; extract_bam; @@ -96,15 +96,16 @@ annotate_tools = params.annotate_tools ? params.annotate_tools.split(',').collec if (!check_parameter_list(annotate_tools,anno_list)) exit 1, 'Unknown tool(s) to annotate, see --help for more information' // // Check parameters -// if ((params.ascat_ploidy && !params.ascat_purity) || (!params.ascat_ploidy && params.ascat_purity)) exit 1, 'Please specify both --ascat_purity and --ascat_ploidy, or none of them' -// if (params.cf_window && params.cf_coeff) exit 1, 'Please specify either --cf_window OR --cf_coeff, but not both of them' +if ((params.ascat_ploidy && !params.ascat_purity) || (!params.ascat_ploidy && params.ascat_purity)) exit 1, 'Please specify both --ascat_purity and --ascat_ploidy, or none of them' +if (params.cf_window && params.cf_coeff) exit 1, 'Please specify either --cf_window OR --cf_coeff, but not both of them' +if (params.umi && !(params.read_structure1 && params.read_structure2)) exit 1, 'Please specify both --read_structure1 and --read_structure2, when using --umi' // Handle input tsv_path = null if (params.input && (has_extension(params.input, "tsv") || has_extension(params.input, "vcf") || has_extension(params.input, "vcf.gz"))) tsv_path = params.input if (params.input && (has_extension(params.input, "vcf") || has_extension(params.input, "vcf.gz"))) step = "annotate" -// save_bam_mapped = params.skip_markduplicates ? true : params.save_bam_mapped ? true : false +save_bam_mapped = params.skip_markduplicates ? true : params.save_bam_mapped ? true : false // If no input file specified, trying to get TSV files corresponding to step in the TSV directory // only for steps preparerecalibration, recalibrate, variantcalling and controlfreec @@ -176,7 +177,6 @@ if (tsv_path) { */ // Initialize each params in params.genomes, catch the command line first if it was defined - params.ac_loci = params.genome ? 
params.genomes[params.genome].ac_loci ?: false : false params.ac_loci_gc = params.genome ? params.genomes[params.genome].ac_loci_gc ?: false : false params.bwa = params.genome ? params.genomes[params.genome].bwa ?: false : false @@ -197,7 +197,7 @@ params.snpeff_db = params.genome ? params.genomes[params.genome].s params.species = params.genome ? params.genomes[params.genome].species ?: false : false params.vep_cache_version = params.genome ? params.genomes[params.genome].vep_cache_version ?: false : false -// Initialize file channels based on params +// Initialize file channels based on params, defined in the params.genomes[params.genome] scope chr_dir = params.chr_dir ?: Channel.empty() chr_length = params.chr_length ?: Channel.empty() dbsnp = params.dbsnp ?: Channel.empty() @@ -207,21 +207,25 @@ known_indels = params.known_indels ?: Channel.empty() loci = params.ac_loci ?: Channel.empty() loci_gc = params.ac_loci_gc ?: Channel.empty() mappability = params.mappability ?: Channel.empty() -pon = params.pon ?: Channel.empty() -// Initialize value channels based on params -snpeff_cache = params.snpeff_cache ?: Channel.empty() +// Initialize value channels based on params, defined in the params.genomes[params.genome] scope snpeff_db = params.snpeff_db ?: Channel.empty() snpeff_species = params.species ?: Channel.empty() -vep_cache = params.vep_cache ?: Channel.empty() vep_cache_version = params.vep_cache_version ?: Channel.empty() -// Optional files, not defined within the params.genomes[params.genome] scope -cadd_indels = params.cadd_indels ?: Channel.empty() -cadd_indels_tbi = params.cadd_indels_tbi ?: Channel.empty() -cadd_wg_snvs = params.cadd_wg_snvs ?: Channel.empty() -cadd_wg_snvs_tbi = params.cadd_wg_snvs_tbi ?: Channel.empty() -target_bed = params.target_bed ?: Channel.empty() +// Initialize file channels based on params, not defined within the params.genomes[params.genome] scope +cadd_indels = params.cadd_indels ?: Channel.empty() +cadd_indels_tbi = params.cadd_indels_tbi ?: Channel.empty() +cadd_wg_snvs = params.cadd_wg_snvs ?: Channel.empty() +cadd_wg_snvs_tbi = params.cadd_wg_snvs_tbi ?: Channel.empty() +pon = params.pon ?: Channel.empty() +snpeff_cache = params.snpeff_cache ?: Channel.empty() +target_bed = params.target_bed ?: Channel.empty() +vep_cache = params.vep_cache ?: Channel.empty() + +// Initialize value channels based on params, not defined within the params.genomes[params.genome] scope +read_structure1 = params.read_structure1 ?: Channel.empty() +read_structure2 = params.read_structure2 ?: Channel.empty() /* ================================================================================ diff --git a/nextflow.config b/nextflow.config index 0a94a3559d..3aa1351217 100644 --- a/nextflow.config +++ b/nextflow.config @@ -66,7 +66,10 @@ params { no_strelka_bp = null // Strelka will use Manta candidateSmallIndels if available pon = false // No default PON (Panel of Normals) file for GATK Mutect2 / Sentieon TNscope pon_index = false // No default PON index for GATK Mutect2 / Sentieon TNscope - ignore_soft_clipped_bases = null // GATK Mutect2 + ignore_soft_clipped_bases = null // no --dont-use-soft-clipped-bases for GATK Mutect2 + umi = null // no umi + read_structure1 = null // no umi + read_structure2 = null // no umi // Annotation annotate_tools = null // Only with --step annotate @@ -104,7 +107,6 @@ params { // Base specifications // Defaults only, expecting to be overwritten - cpus = 8 max_cpus = 16 max_memory = 128.GB max_time = 240.h From 
216a15357f34406cd2cb7825cc984a7c96385928 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Sat, 25 Jul 2020 12:07:40 +0200 Subject: [PATCH 078/200] fix markduplicates and add baserecalibrator --- conf/modules.config | 11 ++-- main.nf | 56 ++++++++++--------- modules/local/process/base_recalibration.nf | 40 ------------- .../nf-core/software/gatk_baserecalibrator.nf | 40 +++++++++++++ .../nf-core/software/gatk_markduplicates.nf | 4 +- modules/nf-core/software/multiqc.nf | 8 ++- modules/nf-core/subworkflow/qc_trim.nf | 2 +- nextflow.config | 1 + 8 files changed, 84 insertions(+), 78 deletions(-) delete mode 100644 modules/local/process/base_recalibration.nf create mode 100644 modules/nf-core/software/gatk_baserecalibrator.nf diff --git a/conf/modules.config b/conf/modules.config index 80daaf6af8..3e3d051022 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -19,16 +19,17 @@ params { publish_results = "all" } 'bwamem2_index' { - args = "-a bwtsw" + args = "" suffix = "" publish_dir = "genome/bwa_index" publish_results = "all" } 'bwamem2_mem' { - args = "-M" - args2 = "-h -F 0x0100 -O BAM" - suffix = ".Lb" - publish_dir = "bwa/library" + args = "-K 100000000 -M" + args2 = "" + extra = "" + suffix = "" + publish_dir = "" publish_results = "all" } 'gatk_markduplicates' { diff --git a/main.nf b/main.nf index 38c9b4cd73..e463c644d1 100644 --- a/main.nf +++ b/main.nf @@ -261,7 +261,6 @@ include { BWAMEM2_MEM } from './modules/local/process/bwamem2_mem' include { GET_SOFTWARE_VERSIONS } from './modules/local/process/get_software_versions' include { OUTPUT_DOCUMENTATION } from './modules/local/process/output_documentation' include { MERGE_BAM_MAPPED } from './modules/local/process/merge_mapped_bam' -//include { BASE_RECALIBRATION } from './modules/local/base_recalibration' params(params) /* ================================================================================ @@ -277,8 +276,9 @@ include { BUILD_INDICES } from './modules/local/subworkflow/build_indices' ================================================================================ */ -include { GATK_MARKDUPLICATES as MARKDUPLICATES} from './modules/nf-core/software/gatk_markduplicates' -include { MULTIQC } from './modules/nf-core/software/multiqc' +include { GATK_MARKDUPLICATES as MARKDUPLICATES } from './modules/nf-core/software/gatk_markduplicates' +include { GATK_BASERECALIBRATOR as BASERECALIBRATOR } from './modules/nf-core/software/gatk_baserecalibrator' +include { MULTIQC } from './modules/nf-core/software/multiqc' /* ================================================================================ @@ -409,34 +409,36 @@ workflow { bam_mapped = bam_single.mix(MERGE_BAM_MAPPED(bam_multiple)) - markduplicates_report = Channel.empty() - markduplicates_bam = Channel.empty() + report_markduplicates = Channel.empty() + bam_markduplicates = bam_mapped if (!(params.skip_markduplicates)) { - // MARKDUPLICATES(bam_mapped) - // markduplicates_report = MARKDUPLICATES.out.report - // markduplicates_bam = MARKDUPLICATES.out.bam + MARKDUPLICATES(bam_mapped) + report_markduplicates = MARKDUPLICATES.out.report + bam_markduplicates = MARKDUPLICATES.out.bam } -// // bamBaseRecalibrator = markduplicates_bam.combine(BUILD_INDICES.out.intervals) - -// // //BASE_RECALIBRATION(bamBaseRecalibrator,dbsnp, dbsnp_index,fasta,) - -// OUTPUT_DOCUMENTATION( -// output_docs, -// output_docs_images) - -// GET_SOFTWARE_VERSIONS() - -// MULTIQC( -// fastqc_html.collect().ifEmpty([]), -// fastqc_zip.collect().ifEmpty([]), -// multiqc_config, -// 
multiqc_custom_config.ifEmpty([]), -// GET_SOFTWARE_VERSIONS.out.yml, -// trimgalore_report.collect().ifEmpty([]), -// markduplicates_report.collect().ifEmpty([]), -// workflow_summary) + bam_baserecalibrator = bam_markduplicates.combine(BUILD_INDICES.out.intervals) + + BASERECALIBRATOR(bam_baserecalibrator, dbsnp, dbsnp_tbi, dict, fai, fasta, known_indels, known_indels_tbi) + + OUTPUT_DOCUMENTATION( + output_docs, + output_docs_images) + + GET_SOFTWARE_VERSIONS() + + MULTIQC( + GET_SOFTWARE_VERSIONS.out.yml, + QC_TRIM.out.fastqc_html.collect().ifEmpty([]), + QC_TRIM.out.fastqc_zip.collect().ifEmpty([]), + QC_TRIM.out.trimgalore_html.collect().ifEmpty([]), + QC_TRIM.out.trimgalore_log.collect().ifEmpty([]), + QC_TRIM.out.trimgalore_zip.collect().ifEmpty([]), + multiqc_config, + multiqc_custom_config.ifEmpty([]), + report_markduplicates.collect().ifEmpty([]), + workflow_summary) } /* diff --git a/modules/local/process/base_recalibration.nf b/modules/local/process/base_recalibration.nf deleted file mode 100644 index 9eca053656..0000000000 --- a/modules/local/process/base_recalibration.nf +++ /dev/null @@ -1,40 +0,0 @@ -// process BASE_RECALIBRATION { -// label 'cpus_1' - -// tag "${idPatient}-${idSample}-${intervalBed.baseName}" - -// input: -// tuple idPatient, idSample, file(bam), file(bai), file(intervalBed) //from bamBaseRecalibrator -// path dbsnp //from dbsnp -// path dbsnpIndex// from dbsnp_tbi -// path fasta //from fasta -// path dict // from dict -// path fastaFai // from fai -// path knownIndels // from known_indels -// path knownIndelsIndex // from known_indels_tbi - -// output: -// tuple idPatient, idSample, file "${prefix}${idSample}.recal.table", emit: tableGatherBQSRReports -// tuple idPatient, idSample, emit: recalTableTSVnoInt - -// //when: params.known_indels - -// script: -// dbsnpOptions = params.dbsnp ? "--known-sites ${dbsnp}" : "" -// knownOptions = params.known_indels ? knownIndels.collect{"--known-sites ${it}"}.join(' ') : "" -// prefix = params.no_intervals ? "" : "${intervalBed.baseName}_" -// intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}" -// // TODO: --use-original-qualities ??? -// """ -// gatk --java-options -Xmx${task.memory.toGiga()}g \ -// BaseRecalibrator \ -// -I ${bam} \ -// -O ${prefix}${idSample}.recal.table \ -// --tmp-dir . \ -// -R ${fasta} \ -// ${intervalsOptions} \ -// ${dbsnpOptions} \ -// ${knownOptions} \ -// --verbosity INFO -// """ -// } \ No newline at end of file diff --git a/modules/nf-core/software/gatk_baserecalibrator.nf b/modules/nf-core/software/gatk_baserecalibrator.nf new file mode 100644 index 0000000000..cfbec2c4ae --- /dev/null +++ b/modules/nf-core/software/gatk_baserecalibrator.nf @@ -0,0 +1,40 @@ +process GATK_BASERECALIBRATOR { + label 'cpus_1' + + tag "${meta.id}-${interval.baseName}" + + input: + tuple val(meta), path(bam), path(bai), path(interval) + path dbsnp + path dbsnp_tbi + path dict + path fai + path fasta + path known_indels + path known_indels_tbi + + output: + tuple val(meta), path("${prefix}${meta.sample}.recal.table"), emit: report + val meta, emit: tsv + + //when: params.known_indels + + script: + options_dbsnp = params.dbsnp ? "--known-sites ${dbsnp}" : "" + options_intervals = params.no_intervals ? "" : "-L ${interval}" + options_known_indels = params.known_indels ? known_indels.collect{"--known-sites ${it}"}.join(' ') : "" + prefix = params.no_intervals ? "" : "${interval.baseName}_" + // TODO: --use-original-qualities ??? 
+ """ + gatk --java-options -Xmx${task.memory.toGiga()}g \ + BaseRecalibrator \ + -I ${bam} \ + -O ${prefix}${meta.sample}.recal.table \ + --tmp-dir . \ + -R ${fasta} \ + ${options_dbsnp} \ + ${options_known_indels} \ + ${options_intervals} \ + --verbosity INFO + """ +} \ No newline at end of file diff --git a/modules/nf-core/software/gatk_markduplicates.nf b/modules/nf-core/software/gatk_markduplicates.nf index ba7e99099d..813b1a9fa6 100644 --- a/modules/nf-core/software/gatk_markduplicates.nf +++ b/modules/nf-core/software/gatk_markduplicates.nf @@ -1,6 +1,6 @@ process GATK_MARKDUPLICATES { label 'cpus_16' - tag "${id}" + tag "${meta.id}" publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { @@ -13,7 +13,7 @@ process GATK_MARKDUPLICATES { output: tuple val(meta), path("${meta.sample}.md.bam"), path("${meta.sample}.md.bam.bai"), emit: bam - tuple val(meta), emit: tsv + val meta, emit: tsv path "${meta.sample}.bam.metrics", optional : true, emit: report script: diff --git a/modules/nf-core/software/multiqc.nf b/modules/nf-core/software/multiqc.nf index 4be4ed44ff..30689b9564 100644 --- a/modules/nf-core/software/multiqc.nf +++ b/modules/nf-core/software/multiqc.nf @@ -9,13 +9,15 @@ process MULTIQC { publishDir "${params.outdir}/multiqc", mode: params.publish_dir_mode input: + path software_versions path fastqc_html path fastqc_zip + path trim_galore_html + path trim_galore_log + path trim_galore_zip path multiqc_config path multiqc_custom_config - path software_versions - path trim_galore - path mark_duplicates + path report_markduplicates val workflow_summary output: diff --git a/modules/nf-core/subworkflow/qc_trim.nf b/modules/nf-core/subworkflow/qc_trim.nf index d9fedc1250..1c6b16b863 100644 --- a/modules/nf-core/subworkflow/qc_trim.nf +++ b/modules/nf-core/subworkflow/qc_trim.nf @@ -47,7 +47,7 @@ workflow QC_TRIM { fastqc_version // path: *.version.txt reads = trim_reads // channel: [ val(meta), [ reads ] ] trimgalore_html // path: *.html - trimgalore_zip // path: *.zip trimgalore_log // path: *.txt + trimgalore_zip // path: *.zip trimgalore_version // path: *.version.txt } diff --git a/nextflow.config b/nextflow.config index 3aa1351217..6193471aa4 100644 --- a/nextflow.config +++ b/nextflow.config @@ -107,6 +107,7 @@ params { // Base specifications // Defaults only, expecting to be overwritten + cpus = 8 max_cpus = 16 max_memory = 128.GB max_time = 240.h From b24c9c651d1ba34169d2e1a4aac517c60179eb17 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Sat, 25 Jul 2020 16:49:49 +0200 Subject: [PATCH 079/200] fix bwamem2_mem process --- modules/local/process/bwamem2_mem.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/process/bwamem2_mem.nf b/modules/local/process/bwamem2_mem.nf index 2a1a7253ce..c3e9d99609 100644 --- a/modules/local/process/bwamem2_mem.nf +++ b/modules/local/process/bwamem2_mem.nf @@ -24,7 +24,7 @@ process BWAMEM2_MEM { extra = meta.status == 1 ? 
"-B 3" : "" """ bwa-mem2 mem \ - ${options.args_bwamem2} \ + ${options.args} \ -R \"${readGroup}\" \ ${extra} \ -t ${task.cpus} \ From 496b745edc442188f72b2d713e7c29eee2089b0b Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Sat, 25 Jul 2020 16:54:52 +0200 Subject: [PATCH 080/200] add MERGE_BAM_RECAL --- main.nf | 9 +++++---- .../local/process/{merge_mapped_bam.nf => merge_bam.nf} | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) rename modules/local/process/{merge_mapped_bam.nf => merge_bam.nf} (92%) diff --git a/main.nf b/main.nf index e463c644d1..0e0c240760 100644 --- a/main.nf +++ b/main.nf @@ -257,10 +257,11 @@ if (params.sentieon) log.warn "[nf-core/sarek] Sentieon will be used, only works ================================================================================ */ -include { BWAMEM2_MEM } from './modules/local/process/bwamem2_mem' -include { GET_SOFTWARE_VERSIONS } from './modules/local/process/get_software_versions' -include { OUTPUT_DOCUMENTATION } from './modules/local/process/output_documentation' -include { MERGE_BAM_MAPPED } from './modules/local/process/merge_mapped_bam' +include { BWAMEM2_MEM } from './modules/local/process/bwamem2_mem' +include { GET_SOFTWARE_VERSIONS } from './modules/local/process/get_software_versions' +include { OUTPUT_DOCUMENTATION } from './modules/local/process/output_documentation' +include { MERGE_BAM as MERGE_BAM_MAPPED; + MERGE_BAM as MERGE_BAM_RECAL;} from './modules/local/process/merge_bam' /* ================================================================================ diff --git a/modules/local/process/merge_mapped_bam.nf b/modules/local/process/merge_bam.nf similarity index 92% rename from modules/local/process/merge_mapped_bam.nf rename to modules/local/process/merge_bam.nf index 409993f068..076393d825 100644 --- a/modules/local/process/merge_mapped_bam.nf +++ b/modules/local/process/merge_bam.nf @@ -1,4 +1,4 @@ -process MERGE_BAM_MAPPED { +process MERGE_BAM { label 'cpus_8' tag "${meta.id}" From 6f3c37f60f473e10d977375f21a41fca63133927 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Sat, 25 Jul 2020 17:21:33 +0200 Subject: [PATCH 081/200] add gatherbqsrreports --- main.nf | 36 ++++++++++++++++--- .../nf-core/software/gatk_baserecalibrator.nf | 2 +- .../software/gatk_gatherbqsrreports.nf | 28 +++++++++++++++ 3 files changed, 60 insertions(+), 6 deletions(-) create mode 100644 modules/nf-core/software/gatk_gatherbqsrreports.nf diff --git a/main.nf b/main.nf index 0e0c240760..ffd96fa88d 100644 --- a/main.nf +++ b/main.nf @@ -277,9 +277,10 @@ include { BUILD_INDICES } from './modules/local/subworkflow/build_indices' ================================================================================ */ -include { GATK_MARKDUPLICATES as MARKDUPLICATES } from './modules/nf-core/software/gatk_markduplicates' -include { GATK_BASERECALIBRATOR as BASERECALIBRATOR } from './modules/nf-core/software/gatk_baserecalibrator' -include { MULTIQC } from './modules/nf-core/software/multiqc' +include { GATK_BASERECALIBRATOR as BASERECALIBRATOR } from './modules/nf-core/software/gatk_baserecalibrator' +include { GATK_GATHERBQSRREPORTS as GATHERBQSRREPORTS } from './modules/nf-core/software/gatk_gatherbqsrreports' +include { GATK_MARKDUPLICATES as MARKDUPLICATES } from './modules/nf-core/software/gatk_markduplicates' +include { MULTIQC } from './modules/nf-core/software/multiqc' /* ================================================================================ @@ -368,7 +369,7 @@ workflow { BWAMEM2_MEM(QC_TRIM.out.reads, bwa, fasta, fai, 
params.modules['bwamem2_mem']) - results = BWAMEM2_MEM.out.map{ meta, bam, bai -> + BWAMEM2_MEM.out.map{ meta, bam, bai -> patient = meta.patient sample = meta.sample gender = meta.gender status = meta.status [patient, sample, gender, status, bam, bai] } .branch{ single: it[4].size() == 1 multiple: it[4].size() > 1 - }.set { bam } + }.set{ bam } bam_single = bam.single.map { patient, sample, gender, status, bam, bai -> @@ -423,6 +424,31 @@ workflow { BASERECALIBRATOR(bam_baserecalibrator, dbsnp, dbsnp_tbi, dict, fai, fasta, known_indels, known_indels_tbi) + if (!params.no_intervals) { + BASERECALIBRATOR.out.report.map{ meta, table -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + [patient, sample, gender, status, table] + }.groupTuple(by: [0,1]).set{ recaltable } + + recaltable = recaltable.map { + patient, sample, gender, status, recal -> + + def meta = [:] + meta.patient = patient + meta.sample = sample + meta.gender = gender[0] + meta.status = status[0] + meta.id = sample + + [meta, recal] + } + + GATHERBQSRREPORTS(recaltable) + } + OUTPUT_DOCUMENTATION( output_docs, output_docs_images) diff --git a/modules/nf-core/software/gatk_baserecalibrator.nf b/modules/nf-core/software/gatk_baserecalibrator.nf index cfbec2c4ae..58452201e4 100644 --- a/modules/nf-core/software/gatk_baserecalibrator.nf +++ b/modules/nf-core/software/gatk_baserecalibrator.nf @@ -15,7 +15,7 @@ process GATK_BASERECALIBRATOR { output: tuple val(meta), path("${prefix}${meta.sample}.recal.table"), emit: report - val meta, emit: tsv + val meta, emit: tsv //when: params.known_indels diff --git a/modules/nf-core/software/gatk_gatherbqsrreports.nf b/modules/nf-core/software/gatk_gatherbqsrreports.nf new file mode 100644 index 0000000000..c33403e5fe --- /dev/null +++ b/modules/nf-core/software/gatk_gatherbqsrreports.nf @@ -0,0 +1,28 @@ +process GATK_GATHERBQSRREPORTS { + label 'memory_singleCPU_2_task' + label 'cpus_2' + tag "${meta.id}" + + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { + if (it == "${meta.sample}.recal.table" && !params.skip_markduplicates) "Preprocessing/${meta.sample}/DuplicatesMarked/${it}" + else "Preprocessing/${meta.sample}/Mapped/${it}" + } + + input: + tuple val(meta), path(recal) + + output: + tuple val(meta), path("${meta.sample}.recal.table"), emit: table + path "${meta.sample}.recal.table", emit: report + val meta, emit: tsv + + script: + input = recal.collect{"-I ${it}"}.join(' ') + """ + gatk --java-options -Xmx${task.memory.toGiga()}g \ + GatherBQSRReports \ + ${input} \ + -O ${meta.sample}.recal.table + """ +} \ No newline at end of file From 70dcb1bd1a2ebd2c134f4c773d1305008f01f59d Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Thu, 30 Jul 2020 09:27:45 +0200 Subject: [PATCH 082/200] Add ApplyBQSR module --- modules/nf-core/software/gatk_applybqsr.nf | 28 ++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 modules/nf-core/software/gatk_applybqsr.nf diff --git a/modules/nf-core/software/gatk_applybqsr.nf b/modules/nf-core/software/gatk_applybqsr.nf new file mode 100644 index 0000000000..0bf1b147fa --- /dev/null +++ b/modules/nf-core/software/gatk_applybqsr.nf @@ -0,0 +1,28 @@ +process GATK_APPLYBQSR { + label 'memory_singleCPU_2_task' + label 'cpus_2' + + tag "${meta.id}-${interval.baseName}" + + input: + tuple val(meta) path(bam), path(bai), path(recalibrationReport), file(intervalBed) from bamApplyBQSR + path dict + path fasta + path fai + + output: + tuple val(meta), path("${prefix}${idSample}.recal.bam") + + script: + 
prefix = params.no_intervals ? "" : "${interval.baseName}_" + options_intervals = params.no_intervals ? "" : "-L ${interval}" + """ + gatk --java-options -Xmx${task.memory.toGiga()}g \ + ApplyBQSR \ + -R ${fasta} \ + --input ${bam} \ + --output ${prefix}${meta.sample}.recal.bam \ + ${options_intervals} \ + --bqsr-recal-file ${recalibrationReport} + """ +} \ No newline at end of file From 4fcb25a7205540977f4c78b0428a5ec8fecd1280 Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Thu, 30 Jul 2020 11:58:38 +0200 Subject: [PATCH 083/200] Add ApplyBQSR process in WF --- main.nf | 65 ++++++++++++++++++++++ modules/nf-core/software/gatk_applybqsr.nf | 4 +- 2 files changed, 67 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index ffd96fa88d..0b9962ef09 100644 --- a/main.nf +++ b/main.nf @@ -280,6 +280,7 @@ include { BUILD_INDICES } from './modules/local/subworkflow/build_indices' include { GATK_BASERECALIBRATOR as BASERECALIBRATOR } from './modules/nf-core/software/gatk_baserecalibrator' include { GATK_GATHERBQSRREPORTS as GATHERBQSRREPORTS } from './modules/nf-core/software/gatk_gatherbqsrreports' include { GATK_MARKDUPLICATES as MARKDUPLICATES } from './modules/nf-core/software/gatk_markduplicates' +include { GATK_APPLYBQSR as APPLYBQSR } from './modules/nf-core/software/gatk_applybqsr' include { MULTIQC } from './modules/nf-core/software/multiqc' /* @@ -447,8 +448,72 @@ workflow { } GATHERBQSRREPORTS(recaltable) + // if ('baserecalibrator' in skip_qc) baseRecalibratorReport.close() } + // (recalTableTSV, recalTableSampleTSV) = recalTableTSV.mix(recalTableTSVnoInt).into(2) + +// // Create TSV files to restart from this step +// if (params.skip_markduplicates) { +// recalTableTSV.map { idPatient, idSample -> +// status = status_map[idPatient, idSample] +// gender = gender_map[idPatient] +// bam = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam" +// bai = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam.bai" +// recalTable = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.recal.table" +// "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n" +// }.collectFile( +// name: 'mapped_no_duplicates_marked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" +// ) + +// recalTableSampleTSV +// .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV/") { +// idPatient, idSample -> +// status = status_map[idPatient, idSample] +// gender = gender_map[idPatient] +// bam = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam" +// bai = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam.bai" +// recalTable = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.recal.table" +// ["mapped_no_duplicates_marked_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n"] +// } +// } else { +// recalTableTSV.map { idPatient, idSample -> +// status = status_map[idPatient, idSample] +// gender = gender_map[idPatient] +// bam = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam" +// bai = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam.bai" +// recalTable = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.recal.table" + +// "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n" +// }.collectFile( +// name: 'duplicates_marked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" +// ) + +// 
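// NB: the commented block above and the per-sample variant that follows are the DSL1
// "restart from this step" TSV writers, parked here until they are ported: assuming the
// old semantics, `collectFile` concatenates one tab-separated line per sample (patient,
// gender, status, sample, bam, bai, recal table) into ${params.outdir}/Preprocessing/TSV,
// which a later run can pick up again at recalibration (cf. the
// `if (step == 'recalibrate')` hook below). Per-sample writer: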
recalTableSampleTSV +// .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV/") { +// idPatient, idSample -> +// status = status_map[idPatient, idSample] +// gender = gender_map[idPatient] +// bam = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam" +// bai = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam.bai" +// recalTable = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.recal.table" +// ["duplicates_marked_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n"] +// } + +//} + bam_applybqsr = MARKDUPLICATES.out.bam.join(GATHERBQSRREPORTS.out.table, by:[0]) + + //if (step == 'recalibrate') bamApplyBQSR = input_sample + + bam_applybqsr = bam_applybqsr.combine(BUILD_INDICES.out.intervals) + + APPLYBQSR(bam_applybqsr, dict, fasta, fai) + + + + + + OUTPUT_DOCUMENTATION( output_docs, output_docs_images) diff --git a/modules/nf-core/software/gatk_applybqsr.nf b/modules/nf-core/software/gatk_applybqsr.nf index 0bf1b147fa..cd267e1e78 100644 --- a/modules/nf-core/software/gatk_applybqsr.nf +++ b/modules/nf-core/software/gatk_applybqsr.nf @@ -5,13 +5,13 @@ process GATK_APPLYBQSR { tag "${meta.id}-${interval.baseName}" input: - tuple val(meta) path(bam), path(bai), path(recalibrationReport), file(intervalBed) from bamApplyBQSR + tuple val(meta), path(bam), path(bai), path(recalibrationReport), file(interval) path dict path fasta path fai output: - tuple val(meta), path("${prefix}${idSample}.recal.bam") + tuple val(meta), path("${prefix}${meta.sample}.recal.bam") script: prefix = params.no_intervals ? "" : "${interval.baseName}_" From 7c5f5b51c0e1edd7223030c073f1b634a2f67579 Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Fri, 31 Jul 2020 13:58:32 +0200 Subject: [PATCH 084/200] Make samtools index separate process and sort with merge_bam --- main.nf | 42 +++++++++++----------- modules/local/process/bwamem2_mem.nf | 4 +-- modules/local/process/merge_bam.nf | 8 +++-- modules/nf-core/software/samtools_index.nf | 19 ++++++++++ 4 files changed, 47 insertions(+), 26 deletions(-) create mode 100644 modules/nf-core/software/samtools_index.nf diff --git a/main.nf b/main.nf index 0b9962ef09..2a27d4cbc4 100644 --- a/main.nf +++ b/main.nf @@ -277,11 +277,13 @@ include { BUILD_INDICES } from './modules/local/subworkflow/build_indices' ================================================================================ */ -include { GATK_BASERECALIBRATOR as BASERECALIBRATOR } from './modules/nf-core/software/gatk_baserecalibrator' -include { GATK_GATHERBQSRREPORTS as GATHERBQSRREPORTS } from './modules/nf-core/software/gatk_gatherbqsrreports' -include { GATK_MARKDUPLICATES as MARKDUPLICATES } from './modules/nf-core/software/gatk_markduplicates' -include { GATK_APPLYBQSR as APPLYBQSR } from './modules/nf-core/software/gatk_applybqsr' -include { MULTIQC } from './modules/nf-core/software/multiqc' +include { GATK_BASERECALIBRATOR as BASERECALIBRATOR } from './modules/nf-core/software/gatk_baserecalibrator' +include { GATK_GATHERBQSRREPORTS as GATHERBQSRREPORTS } from './modules/nf-core/software/gatk_gatherbqsrreports' +include { GATK_MARKDUPLICATES as MARKDUPLICATES } from './modules/nf-core/software/gatk_markduplicates' +include { GATK_APPLYBQSR as APPLYBQSR } from './modules/nf-core/software/gatk_applybqsr' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_MAPPED } from './modules/nf-core/software/samtools_index' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_RECAL } from 
'./modules/nf-core/software/samtools_index' +include { MULTIQC } from './modules/nf-core/software/multiqc' /* ================================================================================ @@ -370,12 +372,12 @@ workflow { BWAMEM2_MEM(QC_TRIM.out.reads, bwa, fasta, fai, params.modules['bwamem2_mem']) - BWAMEM2_MEM.out.map{ meta, bam, bai -> + BWAMEM2_MEM.out.map{ meta, bam -> //, bai -> patient = meta.patient sample = meta.sample gender = meta.gender status = meta.status - [patient, sample, gender, status, bam, bai] + [patient, sample, gender, status, bam] //, bai] }.groupTuple(by: [0,1]) .branch{ single: it[4].size() == 1 @@ -383,7 +385,7 @@ workflow { }.set{ bam } bam_single = bam.single.map { - patient, sample, gender, status, bam, bai -> + patient, sample, gender, status, bam -> //, bai -> def meta = [:] meta.patient = patient @@ -392,11 +394,11 @@ workflow { meta.status = status[0] meta.id = sample - [meta, bam[0], bai[0]] + [meta, bam[0]] // , bai[0]] } bam_multiple = bam.multiple.map { - patient, sample, gender, status, bam, bai -> + patient, sample, gender, status, bam -> //, bai -> def meta = [:] meta.patient = patient @@ -405,12 +407,12 @@ workflow { meta.status = status[0] meta.id = sample - [meta, bam, bai] + [meta, bam] } // multipleBam = multipleBam.mix(multipleBamSentieon) - bam_mapped = bam_single.mix(MERGE_BAM_MAPPED(bam_multiple)) + bam_mapped = bam_single.mix(SAMTOOLS_INDEX_MAPPED(MERGE_BAM_MAPPED(bam_multiple))) report_markduplicates = Channel.empty() bam_markduplicates = bam_mapped @@ -452,7 +454,6 @@ workflow { } // (recalTableTSV, recalTableSampleTSV) = recalTableTSV.mix(recalTableTSVnoInt).into(2) - // // Create TSV files to restart from this step // if (params.skip_markduplicates) { // recalTableTSV.map { idPatient, idSample -> @@ -501,18 +502,17 @@ workflow { // } //} - bam_applybqsr = MARKDUPLICATES.out.bam.join(GATHERBQSRREPORTS.out.table, by:[0]) - - //if (step == 'recalibrate') bamApplyBQSR = input_sample - + bam_applybqsr = MARKDUPLICATES.out.bam.join(GATHERBQSRREPORTS.out.table) //by:[0] bam_applybqsr = bam_applybqsr.combine(BUILD_INDICES.out.intervals) - APPLYBQSR(bam_applybqsr, dict, fasta, fai) - - - +// if (step == 'recalibrate') bamApplyBQSR = input_sample + APPLYBQSR(bam_applybqsr, dict, fasta, fai) + APPLYBQSR.out.dump() + // (bam_recalibrated_to_merge, bam_recalibrated_to_index) = bam_recalibrated_to_merge.groupTuple(by:[0, 1]).into(2) + MERGE_BAM_RECAL(APPLYBQSR.out) + SAMTOOLS_INDEX_RECAL(MERGE_BAM_RECAL.out) OUTPUT_DOCUMENTATION( output_docs, diff --git a/modules/local/process/bwamem2_mem.nf b/modules/local/process/bwamem2_mem.nf index c3e9d99609..7aa503c00c 100644 --- a/modules/local/process/bwamem2_mem.nf +++ b/modules/local/process/bwamem2_mem.nf @@ -16,7 +16,7 @@ process BWAMEM2_MEM { val options output: - tuple val(meta), path("*.bam"), path("*.bai") + tuple val(meta), path("*.bam")//, path("*.bai") script: CN = params.sequencing_center ? 
"CN:${params.sequencing_center}\\t" : "" @@ -31,7 +31,7 @@ process BWAMEM2_MEM { ${fasta} ${reads} | \ samtools sort --threads ${task.cpus} -m 2G - > ${meta.id}.bam - samtools index ${meta.id}.bam + # samtools index ${meta.id}.bam echo \$(bwa-mem2 version 2>&1) > bwa-mem2.version.txt """ diff --git a/modules/local/process/merge_bam.nf b/modules/local/process/merge_bam.nf index 076393d825..363e4e7c7b 100644 --- a/modules/local/process/merge_bam.nf +++ b/modules/local/process/merge_bam.nf @@ -4,14 +4,16 @@ process MERGE_BAM { tag "${meta.id}" input: - tuple val(meta), path(bam), path(bai) + tuple val(meta), path(bam)//, path(bai) optional: true output: - tuple val(meta), path("${meta.sample}.bam"), path("${meta.sample}.bam.bai") + tuple val(meta), path("${meta.sample}.bam")//, path("${meta.sample}.bam.bai") optional: true + // when: !(params.no_intervals) +// samtools merge --threads ${task.cpus} ${idSample}.bam ${bam} +// samtools merge --threads ${task.cpus} ${idSample}.recal.bam ${bam} script: """ samtools merge --threads ${task.cpus} ${meta.sample}.bam ${bam} - samtools index ${meta.sample}.bam """ } \ No newline at end of file diff --git a/modules/nf-core/software/samtools_index.nf b/modules/nf-core/software/samtools_index.nf new file mode 100644 index 0000000000..8df490a6a7 --- /dev/null +++ b/modules/nf-core/software/samtools_index.nf @@ -0,0 +1,19 @@ +process SAMTOOLS_INDEX { + label 'cpus_8' + + tag "${meta.id}" + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path(bam), path("*.bai") +// samtools index ${idSample}.bam + + // samtools index ${idSample}.recal.bam + + script: + """ + samtools index $bam + """ +} \ No newline at end of file From 47636a8f56bbedc60dbb84dfad6acf20089fc35c Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Fri, 31 Jul 2020 14:40:54 +0200 Subject: [PATCH 085/200] Add qualimap and samtools stats, remove useless comments --- modules/local/process/bwamem2_mem.nf | 2 +- modules/local/process/merge_bam.nf | 7 ++--- modules/nf-core/software/qualimap_bamqc.nf | 33 ++++++++++++++++++++++ modules/nf-core/software/samtools_index.nf | 3 -- modules/nf-core/software/samtools_stats.nf | 20 +++++++++++++ 5 files changed, 57 insertions(+), 8 deletions(-) create mode 100644 modules/nf-core/software/qualimap_bamqc.nf create mode 100644 modules/nf-core/software/samtools_stats.nf diff --git a/modules/local/process/bwamem2_mem.nf b/modules/local/process/bwamem2_mem.nf index 7aa503c00c..920ec8daf5 100644 --- a/modules/local/process/bwamem2_mem.nf +++ b/modules/local/process/bwamem2_mem.nf @@ -16,7 +16,7 @@ process BWAMEM2_MEM { val options output: - tuple val(meta), path("*.bam")//, path("*.bai") + tuple val(meta), path("*.bam") script: CN = params.sequencing_center ? 
"CN:${params.sequencing_center}\\t" : "" diff --git a/modules/local/process/merge_bam.nf b/modules/local/process/merge_bam.nf index 363e4e7c7b..88364a9097 100644 --- a/modules/local/process/merge_bam.nf +++ b/modules/local/process/merge_bam.nf @@ -4,14 +4,13 @@ process MERGE_BAM { tag "${meta.id}" input: - tuple val(meta), path(bam)//, path(bai) optional: true + tuple val(meta), path(bam) output: - tuple val(meta), path("${meta.sample}.bam")//, path("${meta.sample}.bam.bai") optional: true + tuple val(meta), path("${meta.sample}.bam") // when: !(params.no_intervals) -// samtools merge --threads ${task.cpus} ${idSample}.bam ${bam} -// samtools merge --threads ${task.cpus} ${idSample}.recal.bam ${bam} + script: """ samtools merge --threads ${task.cpus} ${meta.sample}.bam ${bam} diff --git a/modules/nf-core/software/qualimap_bamqc.nf b/modules/nf-core/software/qualimap_bamqc.nf new file mode 100644 index 0000000000..dc2ea3a428 --- /dev/null +++ b/modules/nf-core/software/qualimap_bamqc.nf @@ -0,0 +1,33 @@ +process QUALIMAP_BAMQC { + label 'memory_max' + label 'cpus_16' + + tag "${meta.id}" + + publishDir "${params.outdir}/Reports/${meta.id}/bamQC", mode: params.publish_dir_mode + + input: + tuple val(meta), path(bam) + path(targetBED) + + output: + file("${bam.baseName}") + + //when: !('bamqc' in skip_qc) + + script: + use_bed = params.target_bed ? "-gff ${targetBED}" : '' + """ + qualimap --java-mem-size=${task.memory.toGiga()}G \ + bamqc \ + -bam ${bam} \ + --paint-chromosome-limits \ + --genome-gc-distr HUMAN \ + $use_bed \ + -nt ${task.cpus} \ + -skip-duplicated \ + --skip-dup-mode 0 \ + -outdir ${bam.baseName} \ + -outformat HTML + """ +} diff --git a/modules/nf-core/software/samtools_index.nf b/modules/nf-core/software/samtools_index.nf index 8df490a6a7..daa14c099a 100644 --- a/modules/nf-core/software/samtools_index.nf +++ b/modules/nf-core/software/samtools_index.nf @@ -8,9 +8,6 @@ process SAMTOOLS_INDEX { output: tuple val(meta), path(bam), path("*.bai") -// samtools index ${idSample}.bam - - // samtools index ${idSample}.recal.bam script: """ diff --git a/modules/nf-core/software/samtools_stats.nf b/modules/nf-core/software/samtools_stats.nf new file mode 100644 index 0000000000..857ff43be7 --- /dev/null +++ b/modules/nf-core/software/samtools_stats.nf @@ -0,0 +1,20 @@ +process SAMTOOLS_STATS { + label 'cpus_2' + + tag "${meta.id}" + + publishDir "${params.outdir}/Reports/${meta.id}/SamToolsStats", mode: params.publish_dir_mode + + input: + tuple val(meta), path(bam) + + output: + path ("${bam}.samtools.stats.out") + + //when: !('samtools' in skip_qc) + + script: + """ + samtools stats ${bam} > ${bam}.samtools.stats.out + """ +} \ No newline at end of file From a876ba5139f503f4fef1fffe54b1b15bd7314bd3 Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Fri, 31 Jul 2020 15:42:34 +0200 Subject: [PATCH 086/200] Add qc to workflow --- main.nf | 179 ++++----------------- modules/nf-core/software/qualimap_bamqc.nf | 2 +- 2 files changed, 29 insertions(+), 152 deletions(-) diff --git a/main.nf b/main.nf index 2a27d4cbc4..6fe3368722 100644 --- a/main.nf +++ b/main.nf @@ -283,6 +283,8 @@ include { GATK_MARKDUPLICATES as MARKDUPLICATES } from './modules/nf-c include { GATK_APPLYBQSR as APPLYBQSR } from './modules/nf-core/software/gatk_applybqsr' include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_MAPPED } from './modules/nf-core/software/samtools_index' include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_RECAL } from './modules/nf-core/software/samtools_index' +include { SAMTOOLS_STATS as SAMTOOLS_STATS } 
from './modules/nf-core/software/samtools_stats' +include { QUALIMAP_BAMQC as BAMQC } from './modules/nf-core/software/qualimap_bamqc' include { MULTIQC } from './modules/nf-core/software/multiqc' /* @@ -361,6 +363,7 @@ workflow { pon_tbi = params.pon ? params.pon_index ?: BUILD_INDICES.out.pon_tbi : Channel.empty() // PREPROCESSING + // STEP 0.5: QC ON READS QC_TRIM( input_sample, @@ -370,6 +373,8 @@ workflow { params.modules['trimgalore'] ) + // STEP 1: MAPPING READS TO REFERENCE GENOME WITH BWA MEM + BWAMEM2_MEM(QC_TRIM.out.reads, bwa, fasta, fai, params.modules['bwamem2_mem']) BWAMEM2_MEM.out.map{ meta, bam -> //, bai -> @@ -412,21 +417,27 @@ workflow { // multipleBam = multipleBam.mix(multipleBamSentieon) - bam_mapped = bam_single.mix(SAMTOOLS_INDEX_MAPPED(MERGE_BAM_MAPPED(bam_multiple))) + // STEP 1.5: MERGING BAM FROM MULTIPLE LANES + + bam_mapped = bam_single.mix(SAMTOOLS_INDEX_MAPPED(MERGE_BAM_MAPPED(bam_multiple))) //for samtools_index_mapped when: save_bam_mapped || !(params.known_indels) report_markduplicates = Channel.empty() bam_markduplicates = bam_mapped + // STEP 2: MARKING DUPLICATES if (!(params.skip_markduplicates)) { MARKDUPLICATES(bam_mapped) report_markduplicates = MARKDUPLICATES.out.report bam_markduplicates = MARKDUPLICATES.out.bam } - bam_baserecalibrator = bam_markduplicates.combine(BUILD_INDICES.out.intervals) + // STEP 3: CREATING RECALIBRATION TABLES + bam_baserecalibrator = bam_markduplicates.combine(BUILD_INDICES.out.intervals) BASERECALIBRATOR(bam_baserecalibrator, dbsnp, dbsnp_tbi, dict, fai, fasta, known_indels, known_indels_tbi) + // STEP 3.5: MERGING RECALIBRATION TABLES + if (!params.no_intervals) { BASERECALIBRATOR.out.report.map{ meta, table -> patient = meta.patient @@ -453,67 +464,28 @@ workflow { // if ('baserecalibrator' in skip_qc) baseRecalibratorReport.close() } - // (recalTableTSV, recalTableSampleTSV) = recalTableTSV.mix(recalTableTSVnoInt).into(2) -// // Create TSV files to restart from this step -// if (params.skip_markduplicates) { -// recalTableTSV.map { idPatient, idSample -> -// status = status_map[idPatient, idSample] -// gender = gender_map[idPatient] -// bam = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam" -// bai = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam.bai" -// recalTable = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.recal.table" -// "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n" -// }.collectFile( -// name: 'mapped_no_duplicates_marked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" -// ) - -// recalTableSampleTSV -// .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV/") { -// idPatient, idSample -> -// status = status_map[idPatient, idSample] -// gender = gender_map[idPatient] -// bam = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam" -// bai = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam.bai" -// recalTable = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.recal.table" -// ["mapped_no_duplicates_marked_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n"] -// } -// } else { -// recalTableTSV.map { idPatient, idSample -> -// status = status_map[idPatient, idSample] -// gender = gender_map[idPatient] -// bam = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam" -// bai = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam.bai" -// recalTable = 
"${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.recal.table" - -// "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n" -// }.collectFile( -// name: 'duplicates_marked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" -// ) + // STEP 4: RECALIBRATING -// recalTableSampleTSV -// .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV/") { -// idPatient, idSample -> -// status = status_map[idPatient, idSample] -// gender = gender_map[idPatient] -// bam = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam" -// bai = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam.bai" -// recalTable = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.recal.table" -// ["duplicates_marked_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n"] -// } - -//} bam_applybqsr = MARKDUPLICATES.out.bam.join(GATHERBQSRREPORTS.out.table) //by:[0] bam_applybqsr = bam_applybqsr.combine(BUILD_INDICES.out.intervals) - -// if (step == 'recalibrate') bamApplyBQSR = input_sample - + // if (step == 'recalibrate') bamApplyBQSR = input_sample APPLYBQSR(bam_applybqsr, dict, fasta, fai) - APPLYBQSR.out.dump() - // (bam_recalibrated_to_merge, bam_recalibrated_to_index) = bam_recalibrated_to_merge.groupTuple(by:[0, 1]).into(2) + // STEP 4.5: MERGING AND INDEXING THE RECALIBRATED BAM FILES + MERGE_BAM_RECAL(APPLYBQSR.out) SAMTOOLS_INDEX_RECAL(MERGE_BAM_RECAL.out) + // STEP 5: QC + + SAMTOOLS_STATS(MERGE_BAM_RECAL.out) + bamqc = BWAMEM2_MEM.out.mix(MERGE_BAM_RECAL.out) + //bamqc.dump() + BAMQC(BWAMEM2_MEM.out, target_bed) + + + + OUTPUT_DOCUMENTATION( output_docs, output_docs_images) @@ -826,46 +798,8 @@ workflow.onComplete { // // STEP 3: CREATING RECALIBRATION TABLES -// process BaseRecalibrator { -// label 'cpus_1' +// process BaseRecalibrator -// tag "${idPatient}-${idSample}-${intervalBed.baseName}" - -// input: -// set idPatient, idSample, file(bam), file(bai), file(intervalBed) from bamBaseRecalibrator -// file(dbsnp) from dbsnp -// file(dbsnpIndex) from dbsnp_tbi -// file(fasta) from fasta -// file(dict) from dict -// file(fastaFai) from fai -// file(knownIndels) from known_indels -// file(knownIndelsIndex) from known_indels_tbi - -// output: -// set idPatient, idSample, file("${prefix}${idSample}.recal.table") into tableGatherBQSRReports -// set idPatient, idSample into recalTableTSVnoInt - -// when: params.known_indels - -// script: -// dbsnpOptions = params.dbsnp ? "--known-sites ${dbsnp}" : "" -// knownOptions = params.known_indels ? knownIndels.collect{"--known-sites ${it}"}.join(' ') : "" -// prefix = params.no_intervals ? "" : "${intervalBed.baseName}_" -// intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}" -// // TODO: --use-original-qualities ??? -// """ -// gatk --java-options -Xmx${task.memory.toGiga()}g \ -// BaseRecalibrator \ -// -I ${bam} \ -// -O ${prefix}${idSample}.recal.table \ -// --tmp-dir . 
\ -// -R ${fasta} \ -// ${intervalsOptions} \ -// ${dbsnpOptions} \ -// ${knownOptions} \ -// --verbosity INFO -// """ -// } // if (!params.no_intervals) tableGatherBQSRReports = tableGatherBQSRReports.groupTuple(by:[0, 1]) @@ -878,37 +812,6 @@ workflow.onComplete { // // STEP 3.5: MERGING RECALIBRATION TABLES -// process GatherBQSRReports { -// label 'memory_singleCPU_2_task' -// label 'cpus_2' - -// tag "${idPatient}-${idSample}" - -// publishDir params.outdir, mode: params.publish_dir_mode, -// saveAs: { -// if (it == "${idSample}.recal.table" && !params.skip_markduplicates) "Preprocessing/${idSample}/DuplicatesMarked/${it}" -// else "Preprocessing/${idSample}/Mapped/${it}" -// } - -// input: -// set idPatient, idSample, file(recal) from tableGatherBQSRReports - -// output: -// set idPatient, idSample, file("${idSample}.recal.table") into recalTable -// file("${idSample}.recal.table") into baseRecalibratorReport -// set idPatient, idSample into recalTableTSV - -// when: !(params.no_intervals) - -// script: -// input = recal.collect{"-I ${it}"}.join(' ') -// """ -// gatk --java-options -Xmx${task.memory.toGiga()}g \ -// GatherBQSRReports \ -// ${input} \ -// -O ${idSample}.recal.table \ -// """ -// } // if ('baserecalibrator' in skip_qc) baseRecalibratorReport.close() @@ -977,33 +880,7 @@ workflow.onComplete { // // STEP 4: RECALIBRATING // process ApplyBQSR { -// label 'memory_singleCPU_2_task' -// label 'cpus_2' - -// tag "${idPatient}-${idSample}-${intervalBed.baseName}" - -// input: -// set idPatient, idSample, file(bam), file(bai), file(recalibrationReport), file(intervalBed) from bamApplyBQSR -// file(dict) from dict -// file(fasta) from fasta -// file(fastaFai) from fai -// output: -// set idPatient, idSample, file("${prefix}${idSample}.recal.bam") into bam_recalibrated_to_merge - -// script: -// prefix = params.no_intervals ? "" : "${intervalBed.baseName}_" -// intervalsOptions = params.no_intervals ? 
"" : "-L ${intervalBed}" -// """ -// gatk --java-options -Xmx${task.memory.toGiga()}g \ -// ApplyBQSR \ -// -R ${fasta} \ -// --input ${bam} \ -// --output ${prefix}${idSample}.recal.bam \ -// ${intervalsOptions} \ -// --bqsr-recal-file ${recalibrationReport} -// """ -// } // (bam_recalibrated_to_merge, bam_recalibrated_to_index) = bam_recalibrated_to_merge.groupTuple(by:[0, 1]).into(2) diff --git a/modules/nf-core/software/qualimap_bamqc.nf b/modules/nf-core/software/qualimap_bamqc.nf index dc2ea3a428..ffb5afc57a 100644 --- a/modules/nf-core/software/qualimap_bamqc.nf +++ b/modules/nf-core/software/qualimap_bamqc.nf @@ -11,7 +11,7 @@ process QUALIMAP_BAMQC { path(targetBED) output: - file("${bam.baseName}") + path("${bam.baseName}") //when: !('bamqc' in skip_qc) From 9f08a519f3576642c5d1d9d9d6a5d8672c628f32 Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Fri, 31 Jul 2020 16:23:03 +0200 Subject: [PATCH 087/200] Remove everything sentenion, we can look it up later in the released main.nf --- main.nf | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index 6fe3368722..c801887ada 100644 --- a/main.nf +++ b/main.nf @@ -479,9 +479,10 @@ workflow { // STEP 5: QC SAMTOOLS_STATS(MERGE_BAM_RECAL.out) - bamqc = BWAMEM2_MEM.out.mix(MERGE_BAM_RECAL.out) - //bamqc.dump() - BAMQC(BWAMEM2_MEM.out, target_bed) + bamqc = BWAMEM2_MEM.out//.mix(MERGE_BAM_RECAL.out) + bamqc.dump() + + BAMQC(bamqc, target_bed) From ed2fc4224243109c374965b0447ccff57c94abbd Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Fri, 31 Jul 2020 18:30:24 +0200 Subject: [PATCH 088/200] Some more small changes and removal of used code --- main.nf | 490 +++--------------- .../nf-core/software/gatk_markduplicates.nf | 2 +- modules/nf-core/software/qualimap_bamqc.nf | 48 +- modules/nf-core/software/samtools_index.nf | 8 + 4 files changed, 98 insertions(+), 450 deletions(-) diff --git a/main.nf b/main.nf index c801887ada..c9ca22f80c 100644 --- a/main.nf +++ b/main.nf @@ -362,9 +362,12 @@ workflow { known_indels_tbi = params.known_indels ? params.known_indels_index ?: BUILD_INDICES.out.known_indels_tbi.collect() : Channel.empty() pon_tbi = params.pon ? 
params.pon_index ?: BUILD_INDICES.out.pon_tbi : Channel.empty() - // PREPROCESSING + /* + ================================================================================ + PREPROCESSING + ================================================================================ + */ // STEP 0.5: QC ON READS - QC_TRIM( input_sample, ('fastqc' in skip_qc), @@ -374,9 +377,7 @@ workflow { ) // STEP 1: MAPPING READS TO REFERENCE GENOME WITH BWA MEM - BWAMEM2_MEM(QC_TRIM.out.reads, bwa, fasta, fai, params.modules['bwamem2_mem']) - BWAMEM2_MEM.out.map{ meta, bam -> //, bai -> patient = meta.patient sample = meta.sample @@ -415,29 +416,30 @@ workflow { [meta, bam] } - // multipleBam = multipleBam.mix(multipleBamSentieon) - - // STEP 1.5: MERGING BAM FROM MULTIPLE LANES - - bam_mapped = bam_single.mix(SAMTOOLS_INDEX_MAPPED(MERGE_BAM_MAPPED(bam_multiple))) //for samtools_index_mapped when: save_bam_mapped || !(params.known_indels) - + // STEP 1.5: MERGING AND INDEXING BAM FROM MULTIPLE LANES + + bam_mapped = bam_single.mix(MERGE_BAM_MAPPED(bam_multiple)) + //if(save_bam_mapped || !(params.known_indels)) + //TODO: https://github.com/nf-core/sarek/blob/bce378e09de25bb26c388b917f93f84806d3ba27/main.nf#L1478 + //But if SAMTOOLS_INDEX is not run, markduplicates does not work + bam_mapped = SAMTOOLS_INDEX_MAPPED(bam_mapped) + + // STEP 2: MARKING DUPLICATES report_markduplicates = Channel.empty() + bam_markduplicates = bam_mapped - - // STEP 2: MARKING DUPLICATES if (!(params.skip_markduplicates)) { - MARKDUPLICATES(bam_mapped) - report_markduplicates = MARKDUPLICATES.out.report - bam_markduplicates = MARKDUPLICATES.out.bam + bam_mapped.dump() + MARKDUPLICATES(bam_mapped) + report_markduplicates = MARKDUPLICATES.out.report + bam_markduplicates = MARKDUPLICATES.out.bam } // STEP 3: CREATING RECALIBRATION TABLES - bam_baserecalibrator = bam_markduplicates.combine(BUILD_INDICES.out.intervals) BASERECALIBRATOR(bam_baserecalibrator, dbsnp, dbsnp_tbi, dict, fai, fasta, known_indels, known_indels_tbi) // STEP 3.5: MERGING RECALIBRATION TABLES - if (!params.no_intervals) { BASERECALIBRATOR.out.report.map{ meta, table -> patient = meta.patient @@ -465,45 +467,68 @@ workflow { } // STEP 4: RECALIBRATING - bam_applybqsr = MARKDUPLICATES.out.bam.join(GATHERBQSRREPORTS.out.table) //by:[0] bam_applybqsr = bam_applybqsr.combine(BUILD_INDICES.out.intervals) // if (step == 'recalibrate') bamApplyBQSR = input_sample APPLYBQSR(bam_applybqsr, dict, fasta, fai) // STEP 4.5: MERGING AND INDEXING THE RECALIBRATED BAM FILES - - MERGE_BAM_RECAL(APPLYBQSR.out) - SAMTOOLS_INDEX_RECAL(MERGE_BAM_RECAL.out) + if (!(params.no_intervals)){ + MERGE_BAM_RECAL(APPLYBQSR.out) + SAMTOOLS_INDEX_RECAL(MERGE_BAM_RECAL.out) + }else{ + SAMTOOLS_INDEX_RECAL(APPLYBQSR.out) + } // STEP 5: QC - SAMTOOLS_STATS(MERGE_BAM_RECAL.out) - bamqc = BWAMEM2_MEM.out//.mix(MERGE_BAM_RECAL.out) + //TODO This should work but somehow BAMQC is not called + bamqc = BWAMEM2_MEM.out.mix(MERGE_BAM_RECAL.out) bamqc.dump() - BAMQC(bamqc, target_bed) + /* + ================================================================================ + GERMLINE VARIANT CALLING + ================================================================================ + */ + + /* + ================================================================================ + SOMATIC VARIANT CALLING + ================================================================================ + */ + + /* + ================================================================================ + ANNOTATION + 
================================================================================ + */ + /* + ================================================================================ + MultiQC + ================================================================================ + */ OUTPUT_DOCUMENTATION( output_docs, output_docs_images) GET_SOFTWARE_VERSIONS() - MULTIQC( - GET_SOFTWARE_VERSIONS.out.yml, - QC_TRIM.out.fastqc_html.collect().ifEmpty([]), - QC_TRIM.out.fastqc_zip.collect().ifEmpty([]), - QC_TRIM.out.trimgalore_html.collect().ifEmpty([]), - QC_TRIM.out.trimgalore_log.collect().ifEmpty([]), - QC_TRIM.out.trimgalore_zip.collect().ifEmpty([]), - multiqc_config, - multiqc_custom_config.ifEmpty([]), - report_markduplicates.collect().ifEmpty([]), - workflow_summary) + // MULTIQC( + // GET_SOFTWARE_VERSIONS.out.yml, + // QC_TRIM.out.fastqc_html.collect().ifEmpty([]), + // QC_TRIM.out.fastqc_zip.collect().ifEmpty([]), + // QC_TRIM.out.trimgalore_html.collect().ifEmpty([]), + // QC_TRIM.out.trimgalore_log.collect().ifEmpty([]), + // QC_TRIM.out.trimgalore_zip.collect().ifEmpty([]), + // multiqc_config, + // multiqc_custom_config.ifEmpty([]), + // report_markduplicates.collect().ifEmpty([]), + // workflow_summary) } /* @@ -524,8 +549,6 @@ workflow.onComplete { // ================================================================================ // */ -// (intBaseRecalibrator, intApplyBQSR, intHaplotypeCaller, intFreebayesSingle, intMpileup, bedIntervals) = bedIntervals.into(6) - // // STEP 0.5: QC ON READS @@ -557,134 +580,18 @@ workflow.onComplete { // fastQCReport = fastQCFQReport.mix(fastQCBAMReport) -// if (!params.trim_fastq) input_pair_readstrimgalore.close() // // STEP 1: MAPPING READS TO REFERENCE GENOME WITH BWA MEM -// if (params.trim_fastq) input_pair_reads = outputPairReadsTrimGalore -// else input_pair_reads = input_pair_reads.mix(input_bam) - -// input_pair_reads = input_pair_reads.dump(tag:'INPUT') - -// (input_pair_reads, input_pair_reads_sentieon) = input_pair_reads.into(2) -// if (params.sentieon) input_pair_reads.close() -// else input_pair_reads_sentieon.close() - - -// // STEP 1': MAPPING READS TO REFERENCE GENOME WITH SENTIEON BWA MEM - -// process Sentieon_MapReads { -// label 'cpus_max' -// label 'memory_max' -// label 'sentieon' - -// tag "${idPatient}-${idRun}" - -// input: -// set idPatient, idSample, idRun, file(inputFile1), file(inputFile2) from input_pair_reads_sentieon -// file(bwaIndex) from bwa -// file(fasta) from fasta -// file(fastaFai) from fai - -// output: -// set idPatient, idSample, idRun, file("${idSample}_${idRun}.bam") into bam_sentieon_mapped - -// when: params.sentieon - -// script: -// // -K is an hidden option, used to fix the number of reads processed by bwa mem -// // Chunk size can affect bwa results, if not specified, -// // the number of threads can change which can give not deterministic result. -// // cf https://github.com/CCDG/Pipeline-Standardization/blob/master/PipelineStandard.md -// // and https://github.com/gatk-workflows/gatk4-data-processing/blob/8ffa26ff4580df4ac3a5aa9e272a4ff6bab44ba2/processing-for-variant-discovery-gatk4.b37.wgs.inputs.json#L29 -// CN = params.sequencing_center ? "CN:${params.sequencing_center}\\t" : "" -// readGroup = "@RG\\tID:${idRun}\\t${CN}PU:${idRun}\\tSM:${idSample}\\tLB:${idSample}\\tPL:illumina" -// // adjust mismatch penalty for tumor samples -// status = status_map[idPatient, idSample] -// extra = status == 1 ? 
"-B 3" : "" -// """ -// sentieon bwa mem -K 100000000 -R \"${readGroup}\" ${extra} -t ${task.cpus} -M ${fasta} \ -// ${inputFile1} ${inputFile2} | \ -// sentieon util sort -r ${fasta} -o ${idSample}_${idRun}.bam -t ${task.cpus} --sam2bam -i - -// """ -// } - -// bam_sentieon_mapped = bam_sentieon_mapped.dump(tag:'Sentieon Mapped BAM') -// // Sort BAM whether they are standalone or should be merged - -// singleBamSentieon = Channel.create() -// multipleBamSentieon = Channel.create() -// bam_sentieon_mapped.groupTuple(by:[0, 1]) -// .choice(singleBamSentieon, multipleBamSentieon) {it[2].size() > 1 ? 1 : 0} -// singleBamSentieon = singleBamSentieon.map { -// idPatient, idSample, idRun, bam -> -// [idPatient, idSample, bam] -// } -// singleBamSentieon = singleBamSentieon.dump(tag:'Single BAM') +//TODO: needs to be covered when inputBam is supported +// if (params.trim_fastq) input_pair_reads = outputPairReadsTrimGalore //this is covered +// else input_pair_reads = input_pair_reads.mix(input_bam) // // STEP 1.5: MERGING BAM FROM MULTIPLE LANES - -// bam_mapped_merged = bam_mapped_merged.mix(singleBam,singleBamSentieon) - -// (bam_mapped_merged, bam_sentieon_mapped_merged) = bam_mapped_merged.into(2) - -// if (!params.sentieon) bam_sentieon_mapped_merged.close() -// else bam_mapped_merged.close() - -// bam_mapped_merged = bam_mapped_merged.dump(tag:'BAMs for MD') -// bam_sentieon_mapped_merged = bam_sentieon_mapped_merged.dump(tag:'Sentieon BAMs to Index') - -// process IndexBamMergedForSentieon { -// label 'cpus_8' - -// tag "${idPatient}-${idSample}" - -// input: -// set idPatient, idSample, file("${idSample}.bam") from bam_sentieon_mapped_merged - -// output: -// set idPatient, idSample, file("${idSample}.bam"), file("${idSample}.bam.bai") into bam_sentieon_mapped_merged_indexed - -// script: -// """ -// samtools index ${idSample}.bam -// """ -// } - -// (bam_mapped_merged, bam_mapped_merged_to_index) = bam_mapped_merged.into(2) - -//@Maxime: You included this process in merged_bam.nf, right? 
-// process IndexBamFile { -// label 'cpus_8' - -// tag "${idPatient}-${idSample}" - -// publishDir params.outdir, mode: params.publish_dir_mode, -// saveAs: { -// if (save_bam_mapped) "Preprocessing/${idSample}/Mapped/${it}" -// else null -// } - -// input: -// set idPatient, idSample, file("${idSample}.bam") from bam_mapped_merged_to_index - -// output: -// set idPatient, idSample, file("${idSample}.bam"), file("${idSample}.bam.bai") into bam_mapped_merged_indexed -// set idPatient, idSample into tsv_bam_indexed - -// when: save_bam_mapped || !(params.known_indels) - -// script: -// """ -// samtools index ${idSample}.bam -// """ -// } - // if (!save_bam_mapped) tsv_bam_indexed.close() -// (tsv_bam_indexed, tsv_bam_indexed_sample) = tsv_bam_indexed.into(2) // // Creating a TSV file to restart from this step // tsv_bam_indexed.map { idPatient, idSample -> @@ -705,10 +612,10 @@ workflow.onComplete { // bai = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam.bai" // ["mapped_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] // } -// // STEP 2: MARKING DUPLICATES -// (tsv_bam_duplicates_marked, tsv_bam_duplicates_marked_sample) = tsv_bam_duplicates_marked.into(2) +// // STEP 2: MARKING DUPLICATES + // // Creating a TSV file to restart from this step // tsv_bam_duplicates_marked.map { idPatient, idSample -> @@ -734,77 +641,9 @@ workflow.onComplete { // if (step == 'preparerecalibration') bam_duplicates_marked = input_sample -// bam_duplicates_marked = bam_duplicates_marked.dump(tag:'MD BAM') -// duplicates_marked_report = duplicates_marked_report.dump(tag:'MD Report') - -// if (params.skip_markduplicates) bam_duplicates_marked = bam_mapped_merged_indexed - -// (bamMD, bamMDToJoin, bam_duplicates_marked) = bam_duplicates_marked.into(3) - -// - -// // STEP 2': SENTIEON DEDUP - -// process Sentieon_Dedup { -// label 'cpus_max' -// label 'memory_max' -// label 'sentieon' - -// tag "${idPatient}-${idSample}" - -// publishDir params.outdir, mode: params.publish_dir_mode, -// saveAs: { -// if (it == "${idSample}_*.txt" && 'sentieon' in skip_qc) null -// else if (it == "${idSample}_*.txt") "Reports/${idSample}/Sentieon/${it}" -// else "Preprocessing/${idSample}/DedupedSentieon/${it}" -// } - -// input: -// set idPatient, idSample, file(bam), file(bai) from bam_sentieon_mapped_merged_indexed -// file(fasta) from fasta -// file(fastaFai) from fai - -// output: -// set idPatient, idSample, file("${idSample}.deduped.bam"), file("${idSample}.deduped.bam.bai") into bam_sentieon_dedup - -// when: params.sentieon - -// script: -// """ -// sentieon driver \ -// -t ${task.cpus} \ -// -i ${bam} \ -// -r ${fasta} \ -// --algo GCBias --summary ${idSample}_gc_summary.txt ${idSample}_gc_metric.txt \ -// --algo MeanQualityByCycle ${idSample}_mq_metric.txt \ -// --algo QualDistribution ${idSample}_qd_metric.txt \ -// --algo InsertSizeMetricAlgo ${idSample}_is_metric.txt \ -// --algo AlignmentStat ${idSample}_aln_metric.txt - -// sentieon driver \ -// -t ${task.cpus} \ -// -i ${bam} \ -// --algo LocusCollector \ -// --fun score_info ${idSample}_score.gz - -// sentieon driver \ -// -t ${task.cpus} \ -// -i ${bam} \ -// --algo Dedup \ -// --rmdup \ -// --score_info ${idSample}_score.gz \ -// --metrics ${idSample}_dedup_metric.txt ${idSample}.deduped.bam -// """ -// } // // STEP 3: CREATING RECALIBRATION TABLES -// process BaseRecalibrator - - -// if (!params.no_intervals) tableGatherBQSRReports = tableGatherBQSRReports.groupTuple(by:[0, 1]) - -// 
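// NB: the DSL1 `IndexBamFile` process removed above survives as the DSL2 SAMTOOLS_INDEX
// module from PATCH 084: one module body, instantiated twice under aliases instead of a
// copy per step. A minimal sketch, with the names used in this series:
//
//   include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_MAPPED } from './modules/nf-core/software/samtools_index'
//   include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_RECAL  } from './modules/nf-core/software/samtools_index'
//
//   bam_mapped = SAMTOOLS_INDEX_MAPPED(MERGE_BAM_MAPPED(bam_multiple))   // -> (meta, bam, bai)
//   SAMTOOLS_INDEX_RECAL(MERGE_BAM_RECAL.out)
//
// The `dump(tag: ...)` calls in the surviving comments are Nextflow's channel debugging
// hook; their output appears only when the run is launched with `-dump-channels`, e.g.: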
tableGatherBQSRReports = tableGatherBQSRReports.dump(tag:'BQSR REPORTS') // if (params.no_intervals) { // (tableGatherBQSRReports, tableGatherBQSRReportsNoInt) = tableGatherBQSRReports.into(2) @@ -816,7 +655,6 @@ workflow.onComplete { // if ('baserecalibrator' in skip_qc) baseRecalibratorReport.close() -// recalTable = recalTable.dump(tag:'RECAL TABLE') // (recalTableTSV, recalTableSampleTSV) = recalTableTSV.mix(recalTableTSVnoInt).into(2) @@ -868,15 +706,6 @@ workflow.onComplete { // } // } -// bamApplyBQSR = bamMDToJoin.join(recalTable, by:[0,1]) - -// if (step == 'recalibrate') bamApplyBQSR = input_sample - -// bamApplyBQSR = bamApplyBQSR.dump(tag:'BAM + BAI + RECAL TABLE') - -// bamApplyBQSR = bamApplyBQSR.combine(intApplyBQSR) - -// bamApplyBQSR = bamApplyBQSR.dump(tag:'BAM + BAI + RECAL TABLE + INT') // // STEP 4: RECALIBRATING @@ -885,114 +714,6 @@ workflow.onComplete { // (bam_recalibrated_to_merge, bam_recalibrated_to_index) = bam_recalibrated_to_merge.groupTuple(by:[0, 1]).into(2) -// // STEP 4': SENTIEON BQSR - -// bam_sentieon_dedup = bam_sentieon_dedup.dump(tag:'deduped.bam') - -// process Sentieon_BQSR { -// label 'cpus_max' -// label 'memory_max' -// label 'sentieon' - -// tag "${idPatient}-${idSample}" - -// publishDir params.outdir, mode: params.publish_dir_mode, -// saveAs: { -// if (it == "${idSample}_recal_result.csv" && 'sentieon' in skip_qc) "Reports/${idSample}/Sentieon/${it}" -// else "Preprocessing/${idSample}/RecalSentieon/${it}" -// } - -// input: -// set idPatient, idSample, file(bam), file(bai) from bam_sentieon_dedup -// file(dbsnp) from dbsnp -// file(dbsnpIndex) from dbsnp_tbi -// file(fasta) from fasta -// file(dict) from dict -// file(fastaFai) from fai -// file(knownIndels) from known_indels -// file(knownIndelsIndex) from known_indels_tbi - -// output: -// set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bam.bai") into bam_sentieon_recal -// set idPatient, idSample, file(bam), file(bai), file("${idSample}.recal.table") into bam_sentieon_deduped_table -// set idPatient, idSample into tsv_sentieon - -// when: params.sentieon - -// script: -// known = knownIndels.collect{"--known-sites ${it}"}.join(' ') -// """ -// sentieon driver \ -// -t ${task.cpus} \ -// -r ${fasta} \ -// -i ${idSample}.deduped.bam \ -// --algo QualCal \ -// -k ${dbsnp} \ -// ${idSample}.recal.table - -// sentieon driver \ -// -t ${task.cpus} \ -// -r ${fasta} \ -// -i ${idSample}.deduped.bam \ -// -q ${idSample}.recal.table \ -// --algo QualCal \ -// -k ${dbsnp} \ -// ${idSample}.table.post \ -// --algo ReadWriter ${idSample}.recal.bam - -// sentieon driver \ -// -t ${task.cpus} \ -// --algo QualCal \ -// --plot \ -// --before ${idSample}.recal.table \ -// --after ${idSample}.table.post \ -// ${idSample}_recal_result.csv -// """ -// } - -// (tsv_sentieon_deduped, tsv_sentieon_deduped_sample, tsv_sentieon_recal, tsv_sentieon_recal_sample) = tsv_sentieon.into(4) - -// // Creating a TSV file to restart from this step -// tsv_sentieon_deduped.map { idPatient, idSample -> -// gender = gender_map[idPatient] -// status = status_map[idPatient, idSample] -// bam = "${params.outdir}/Preprocessing/${idSample}/DedupedSentieon/${idSample}.deduped.bam" -// bai = "${params.outdir}/Preprocessing/${idSample}/DedupedSentieon/${idSample}.deduped.bam.bai" -// table = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.table" -// "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${table}\n" -// }.collectFile( -// name: 
'sentieon_deduped.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" -// ) - -// tsv_sentieon_deduped_sample -// .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { idPatient, idSample -> -// status = status_map[idPatient, idSample] -// gender = gender_map[idPatient] -// bam = "${params.outdir}/Preprocessing/${idSample}/DedupedSentieon/${idSample}.deduped.bam" -// bai = "${params.outdir}/Preprocessing/${idSample}/DedupedSentieon/${idSample}.deduped.bam.bai" -// table = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.table" -// ["sentieon_deduped_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${table}\n"] -// } - -// // Creating a TSV file to restart from this step -// tsv_sentieon_recal.map { idPatient, idSample -> -// gender = gender_map[idPatient] -// status = status_map[idPatient, idSample] -// bam = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam" -// bai = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam.bai" -// "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n" -// }.collectFile( -// name: 'sentieon_recalibrated.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" -// ) - -// tsv_sentieon_recal_sample -// .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { idPatient, idSample -> -// status = status_map[idPatient, idSample] -// gender = gender_map[idPatient] -// bam = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam" -// bai = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam.bai" -// ["sentieon_recalibrated_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] -// } // // STEP 4.5: MERGING THE RECALIBRATED BAM FILES @@ -1022,28 +743,7 @@ workflow.onComplete { // // STEP 4.5': INDEXING THE RECALIBRATED BAM FILES -// process IndexBamRecal { -// label 'cpus_8' - -// tag "${idPatient}-${idSample}" - -// publishDir "${params.outdir}/Preprocessing/${idSample}/Recalibrated", mode: params.publish_dir_mode - -// input: -// set idPatient, idSample, file("${idSample}.recal.bam") from bam_recalibrated_to_index - -// output: -// set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bam.bai") into bam_recalibrated_indexed -// set idPatient, idSample, file("${idSample}.recal.bam") into bam_recalibrated_no_int_qc -// set idPatient, idSample into tsv_bam_recalibrated_no_int -// when: params.no_intervals - -// script: -// """ -// samtools index ${idSample}.recal.bam -// """ -// } // bam_recalibrated = bam_recalibrated.mix(bam_recalibrated_indexed) // bam_recalibrated_qc = bam_recalibrated_qc.mix(bam_recalibrated_no_int_qc) @@ -1075,66 +775,6 @@ workflow.onComplete { // // STEP 5: QC -// process SamtoolsStats { -// label 'cpus_2' - -// tag "${idPatient}-${idSample}" - -// publishDir "${params.outdir}/Reports/${idSample}/SamToolsStats", mode: params.publish_dir_mode - -// input: -// set idPatient, idSample, file(bam) from bam_recalibrated_samtools_stats - -// output: -// file ("${bam}.samtools.stats.out") into samtoolsStatsReport - -// when: !('samtools' in skip_qc) - -// script: -// """ -// samtools stats ${bam} > ${bam}.samtools.stats.out -// """ -// } - -// samtoolsStatsReport = samtoolsStatsReport.dump(tag:'SAMTools') - -// bamBamQC = bamMappedBamQC.mix(bam_recalibrated_bamqc) - -// process BamQC { -// label 'memory_max' -// label 'cpus_16' - -// tag "${idPatient}-${idSample}" - -// 
publishDir "${params.outdir}/Reports/${idSample}/bamQC", mode: params.publish_dir_mode - -// input: -// set idPatient, idSample, file(bam) from bamBamQC -// file(targetBED) from ch_target_bed - -// output: -// file("${bam.baseName}") into bamQCReport - -// when: !('bamqc' in skip_qc) - -// script: -// use_bed = params.target_bed ? "-gff ${targetBED}" : '' -// """ -// qualimap --java-mem-size=${task.memory.toGiga()}G \ -// bamqc \ -// -bam ${bam} \ -// --paint-chromosome-limits \ -// --genome-gc-distr HUMAN \ -// $use_bed \ -// -nt ${task.cpus} \ -// -skip-duplicated \ -// --skip-dup-mode 0 \ -// -outdir ${bam.baseName} \ -// -outformat HTML -// """ -// } - -// bamQCReport = bamQCReport.dump(tag:'BamQC') // /* // ================================================================================ diff --git a/modules/nf-core/software/gatk_markduplicates.nf b/modules/nf-core/software/gatk_markduplicates.nf index 813b1a9fa6..8833e2ee66 100644 --- a/modules/nf-core/software/gatk_markduplicates.nf +++ b/modules/nf-core/software/gatk_markduplicates.nf @@ -1,6 +1,6 @@ process GATK_MARKDUPLICATES { label 'cpus_16' - tag "${meta.id}" + //tag "${meta.id}" publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { diff --git a/modules/nf-core/software/qualimap_bamqc.nf b/modules/nf-core/software/qualimap_bamqc.nf index ffb5afc57a..28fa40c435 100644 --- a/modules/nf-core/software/qualimap_bamqc.nf +++ b/modules/nf-core/software/qualimap_bamqc.nf @@ -1,33 +1,33 @@ process QUALIMAP_BAMQC { - label 'memory_max' - label 'cpus_16' + // label 'memory_max' + // label 'cpus_16' - tag "${meta.id}" + // tag "${meta.id}" - publishDir "${params.outdir}/Reports/${meta.id}/bamQC", mode: params.publish_dir_mode + // publishDir "${params.outdir}/Reports/${meta.id}/bamQC", mode: params.publish_dir_mode input: - tuple val(meta), path(bam) - path(targetBED) + tuple val(meta), path(bam) + path(targetBED) - output: - path("${bam.baseName}") + output: + path("${bam.baseName}") - //when: !('bamqc' in skip_qc) + // //when: !('bamqc' in skip_qc) - script: - use_bed = params.target_bed ? "-gff ${targetBED}" : '' - """ - qualimap --java-mem-size=${task.memory.toGiga()}G \ - bamqc \ - -bam ${bam} \ - --paint-chromosome-limits \ - --genome-gc-distr HUMAN \ - $use_bed \ - -nt ${task.cpus} \ - -skip-duplicated \ - --skip-dup-mode 0 \ - -outdir ${bam.baseName} \ - -outformat HTML - """ + script: + // use_bed = params.target_bed ? "-gff ${targetBED}" : '' + """ + # // qualimap --java-mem-size=${task.memory.toGiga()}G \ + #// bamqc \ + #// -bam ${bam} \ + #// --paint-chromosome-limits \ + #// --genome-gc-distr HUMAN \ + #// $use_bed \ + #// -nt ${task.cpus} \ + #// -skip-duplicated \ + #// --skip-dup-mode 0 \ + #// -outdir ${bam.baseName} \ + #// -outformat HTML + """ } diff --git a/modules/nf-core/software/samtools_index.nf b/modules/nf-core/software/samtools_index.nf index daa14c099a..db2a1f3221 100644 --- a/modules/nf-core/software/samtools_index.nf +++ b/modules/nf-core/software/samtools_index.nf @@ -3,14 +3,22 @@ process SAMTOOLS_INDEX { tag "${meta.id}" +// publishDir params.outdir, mode: params.publish_dir_mode, +// saveAs: { +// if (save_bam_mapped) "Preprocessing/${idSample}/Mapped/${it}" +// else null +// } + input: tuple val(meta), path(bam) output: tuple val(meta), path(bam), path("*.bai") + //set idPatient, idSample into tsv_bam_indexed script: """ samtools index $bam """ + // samtools index ${idSample}.recal.bam TODO: is the naming here relevant? 
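    // NB, answering the TODO above: `samtools index ${bam}` always writes the index next
    // to its input as `${bam}.bai`, so this module is naming-agnostic and the
    // `path("*.bai")` output glob picks the index up whatever the BAM is called. The
    // `.recal` suffix only matters upstream: MERGE_BAM currently emits `${meta.sample}.bam`
    // for the mapped and the recalibrated merge alike (see the naming TODO in
    // merge_bam.nf below), so the old `${idSample}.recal.bam` name from the DSL1
    // pipeline is not reproduced yet.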
} \ No newline at end of file
From 00de14cd503e8a4d894c6b4581eb83bf21fbc8f8 Mon Sep 17 00:00:00 2001
From: FriederikeHanssen
Date: Fri, 31 Jul 2020 18:33:37 +0200
Subject: [PATCH 089/200] Add multiqc back in

---
 main.nf | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/main.nf b/main.nf
index c9ca22f80c..97c0578d70 100644
--- a/main.nf
+++ b/main.nf
@@ -518,17 +518,17 @@ workflow {
 
     GET_SOFTWARE_VERSIONS()
 
-    // MULTIQC(
-    //     GET_SOFTWARE_VERSIONS.out.yml,
-    //     QC_TRIM.out.fastqc_html.collect().ifEmpty([]),
-    //     QC_TRIM.out.fastqc_zip.collect().ifEmpty([]),
-    //     QC_TRIM.out.trimgalore_html.collect().ifEmpty([]),
-    //     QC_TRIM.out.trimgalore_log.collect().ifEmpty([]),
-    //     QC_TRIM.out.trimgalore_zip.collect().ifEmpty([]),
-    //     multiqc_config,
-    //     multiqc_custom_config.ifEmpty([]),
-    //     report_markduplicates.collect().ifEmpty([]),
-    //     workflow_summary)
+    MULTIQC(
+        GET_SOFTWARE_VERSIONS.out.yml,
+        QC_TRIM.out.fastqc_html.collect().ifEmpty([]),
+        QC_TRIM.out.fastqc_zip.collect().ifEmpty([]),
+        QC_TRIM.out.trimgalore_html.collect().ifEmpty([]),
+        QC_TRIM.out.trimgalore_log.collect().ifEmpty([]),
+        QC_TRIM.out.trimgalore_zip.collect().ifEmpty([]),
+        multiqc_config,
+        multiqc_custom_config.ifEmpty([]),
+        report_markduplicates.collect().ifEmpty([]),
+        workflow_summary)
 }
 
 /*
From 6d12fb3305231016dab34ec685bc11d1e6130c50 Mon Sep 17 00:00:00 2001
From: FriederikeHanssen
Date: Fri, 31 Jul 2020 18:35:35 +0200
Subject: [PATCH 090/200] Remove obsolete when statement

---
 modules/local/process/merge_bam.nf | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/modules/local/process/merge_bam.nf b/modules/local/process/merge_bam.nf
index 88364a9097..9727e4a7d1 100644
--- a/modules/local/process/merge_bam.nf
+++ b/modules/local/process/merge_bam.nf
@@ -2,17 +2,18 @@ process MERGE_BAM {
     label 'cpus_8'
 
     tag "${meta.id}"
-
+    //TODO publishDir
+
    input:
    tuple val(meta), path(bam)
 
    output:
    tuple val(meta), path("${meta.sample}.bam")
 
-    // when: !(params.no_intervals)
-
    script:
    """
    samtools merge --threads ${task.cpus} ${meta.sample}.bam ${bam}
    """
+    //TODO Naming?
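    // NB: one possible resolution of the naming TODO above, sketched here only (the
    // `suffix` value input is an assumption, not something this series adds): pass the
    // output suffix in, so each aliased instantiation keeps its historical name:
    //
    //   input:
    //   tuple val(meta), path(bam)
    //   val suffix
    //
    //   script:
    //   """
    //   samtools merge --threads ${task.cpus} ${meta.sample}${suffix}.bam ${bam}
    //   """
    //
    // called as MERGE_BAM_MAPPED(bam_multiple, '') and MERGE_BAM_RECAL(APPLYBQSR.out, '.recal'),
    // which restores the old name recalled below: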
+ //samtools merge --threads ${task.cpus} ${idSample}.recal.bam ${bam} } \ No newline at end of file From fec939139277c6c34d849aeb6d9311e1a9b6496c Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Fri, 31 Jul 2020 18:36:18 +0200 Subject: [PATCH 091/200] Comment tag back in --- modules/nf-core/software/gatk_markduplicates.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/nf-core/software/gatk_markduplicates.nf b/modules/nf-core/software/gatk_markduplicates.nf index 8833e2ee66..813b1a9fa6 100644 --- a/modules/nf-core/software/gatk_markduplicates.nf +++ b/modules/nf-core/software/gatk_markduplicates.nf @@ -1,6 +1,6 @@ process GATK_MARKDUPLICATES { label 'cpus_16' - //tag "${meta.id}" + tag "${meta.id}" publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { From 18e242a1909c22bab741f7272f441b9fbd4471e7 Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Fri, 31 Jul 2020 18:37:40 +0200 Subject: [PATCH 092/200] Add qualimap_bamqc back in --- modules/nf-core/software/qualimap_bamqc.nf | 32 +++++++++++----------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/modules/nf-core/software/qualimap_bamqc.nf b/modules/nf-core/software/qualimap_bamqc.nf index 28fa40c435..ca1e84ed7e 100644 --- a/modules/nf-core/software/qualimap_bamqc.nf +++ b/modules/nf-core/software/qualimap_bamqc.nf @@ -1,10 +1,10 @@ process QUALIMAP_BAMQC { - // label 'memory_max' - // label 'cpus_16' + label 'memory_max' + label 'cpus_16' - // tag "${meta.id}" + tag "${meta.id}" - // publishDir "${params.outdir}/Reports/${meta.id}/bamQC", mode: params.publish_dir_mode + publishDir "${params.outdir}/Reports/${meta.id}/bamQC", mode: params.publish_dir_mode input: tuple val(meta), path(bam) @@ -16,18 +16,18 @@ process QUALIMAP_BAMQC { // //when: !('bamqc' in skip_qc) script: - // use_bed = params.target_bed ? "-gff ${targetBED}" : '' + use_bed = params.target_bed ? 
"-gff ${targetBED}" : '' """ - # // qualimap --java-mem-size=${task.memory.toGiga()}G \ - #// bamqc \ - #// -bam ${bam} \ - #// --paint-chromosome-limits \ - #// --genome-gc-distr HUMAN \ - #// $use_bed \ - #// -nt ${task.cpus} \ - #// -skip-duplicated \ - #// --skip-dup-mode 0 \ - #// -outdir ${bam.baseName} \ - #// -outformat HTML + qualimap --java-mem-size=${task.memory.toGiga()}G \ + bamqc \ + -bam ${bam} \ + --paint-chromosome-limits \ + --genome-gc-distr HUMAN \ + $use_bed \ + -nt ${task.cpus} \ + -skip-duplicated \ + --skip-dup-mode 0 \ + -outdir ${bam.baseName} \ + -outformat HTML """ } From 0f6151bc22a64e835f0f63ed0cad0bdcc8bbb3a6 Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Fri, 31 Jul 2020 18:39:06 +0200 Subject: [PATCH 093/200] Add conditional samtools stats --- main.nf | 3 ++- modules/nf-core/software/samtools_stats.nf | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index 97c0578d70..558379bd06 100644 --- a/main.nf +++ b/main.nf @@ -481,7 +481,8 @@ workflow { } // STEP 5: QC - SAMTOOLS_STATS(MERGE_BAM_RECAL.out) + if(!('samtools' in skip_qc)) + SAMTOOLS_STATS(MERGE_BAM_RECAL.out) //TODO This should work but somehow BAMQC is not called bamqc = BWAMEM2_MEM.out.mix(MERGE_BAM_RECAL.out) bamqc.dump() diff --git a/modules/nf-core/software/samtools_stats.nf b/modules/nf-core/software/samtools_stats.nf index 857ff43be7..a3cbc23d7f 100644 --- a/modules/nf-core/software/samtools_stats.nf +++ b/modules/nf-core/software/samtools_stats.nf @@ -11,8 +11,6 @@ process SAMTOOLS_STATS { output: path ("${bam}.samtools.stats.out") - //when: !('samtools' in skip_qc) - script: """ samtools stats ${bam} > ${bam}.samtools.stats.out From f18ecdab0894f75ba43ad05d9be87fdccd060151 Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Sat, 1 Aug 2020 00:20:00 +0200 Subject: [PATCH 094/200] Traced the error to target_bed, however also the bam files are reportedly corrupted... --- main.nf | 7 ++++--- modules/nf-core/software/qualimap_bamqc.nf | 6 +++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/main.nf b/main.nf index 558379bd06..7f56791744 100644 --- a/main.nf +++ b/main.nf @@ -220,7 +220,8 @@ cadd_wg_snvs = params.cadd_wg_snvs ?: Channel.empty() cadd_wg_snvs_tbi = params.cadd_wg_snvs_tbi ?: Channel.empty() pon = params.pon ?: Channel.empty() snpeff_cache = params.snpeff_cache ?: Channel.empty() -target_bed = params.target_bed ?: Channel.empty() +//ch_target_bed = params.target_bed ? 
Channel.value(file(params.target_bed)) : "null" +target_bed = params.target_bed ?: Channel.empty() vep_cache = params.vep_cache ?: Channel.empty() // Initialize value channels based on params, not defined within the params.genomes[params.genome] scope @@ -485,8 +486,8 @@ workflow { SAMTOOLS_STATS(MERGE_BAM_RECAL.out) //TODO This should work but somehow BAMQC is not called bamqc = BWAMEM2_MEM.out.mix(MERGE_BAM_RECAL.out) - bamqc.dump() - BAMQC(bamqc, target_bed) + //if(!('bamqc' in skipQC)) + BAMQC(bamqc)//, target_bed) /* diff --git a/modules/nf-core/software/qualimap_bamqc.nf b/modules/nf-core/software/qualimap_bamqc.nf index ca1e84ed7e..26bea1041d 100644 --- a/modules/nf-core/software/qualimap_bamqc.nf +++ b/modules/nf-core/software/qualimap_bamqc.nf @@ -2,13 +2,13 @@ process QUALIMAP_BAMQC { label 'memory_max' label 'cpus_16' - tag "${meta.id}" + //tag "${meta.id}" publishDir "${params.outdir}/Reports/${meta.id}/bamQC", mode: params.publish_dir_mode input: tuple val(meta), path(bam) - path(targetBED) + //path(targetBED) output: path("${bam.baseName}") @@ -16,7 +16,7 @@ process QUALIMAP_BAMQC { // //when: !('bamqc' in skip_qc) script: - use_bed = params.target_bed ? "-gff ${targetBED}" : '' + use_bed = ''//params.target_bed ? "-gff ${targetBED}" : '' """ qualimap --java-mem-size=${task.memory.toGiga()}G \ bamqc \ From 40e2adb4a426750a5af6aaa3eeb2534829a9e2f7 Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Sat, 1 Aug 2020 00:29:03 +0200 Subject: [PATCH 095/200] Merge_Bam_recal bams are the problem, other bams are fine --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 7f56791744..7de51b96f3 100644 --- a/main.nf +++ b/main.nf @@ -485,7 +485,7 @@ workflow { if(!('samtools' in skip_qc)) SAMTOOLS_STATS(MERGE_BAM_RECAL.out) //TODO This should work but somehow BAMQC is not called - bamqc = BWAMEM2_MEM.out.mix(MERGE_BAM_RECAL.out) + bamqc = BWAMEM2_MEM.out//.mix(MERGE_BAM_RECAL.out) //if(!('bamqc' in skipQC)) BAMQC(bamqc)//, target_bed) From c988a96b60e59a6e21528ae23ecb4991eb6bd649 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 4 Aug 2020 16:34:27 +0200 Subject: [PATCH 096/200] fix some issues + code polishing --- main.nf | 101 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 52 insertions(+), 49 deletions(-) diff --git a/main.nf b/main.nf index 7de51b96f3..b446c3bef6 100644 --- a/main.nf +++ b/main.nf @@ -197,16 +197,18 @@ params.snpeff_db = params.genome ? params.genomes[params.genome].s params.species = params.genome ? params.genomes[params.genome].species ?: false : false params.vep_cache_version = params.genome ? params.genomes[params.genome].vep_cache_version ?: false : false +file("${params.outdir}/no_file").text = "no_file\n" + // Initialize file channels based on params, defined in the params.genomes[params.genome] scope -chr_dir = params.chr_dir ?: Channel.empty() -chr_length = params.chr_length ?: Channel.empty() -dbsnp = params.dbsnp ?: Channel.empty() -fasta = params.fasta ?: Channel.empty() -germline_resource = params.germline_resource ?: Channel.empty() -known_indels = params.known_indels ?: Channel.empty() -loci = params.ac_loci ?: Channel.empty() -loci_gc = params.ac_loci_gc ?: Channel.empty() -mappability = params.mappability ?: Channel.empty() +chr_dir = params.chr_dir ? file(params.chr_dir) : file("${params.outdir}/no_file") +chr_length = params.chr_length ? file(params.chr_length) : file("${params.outdir}/no_file") +dbsnp = params.dbsnp ? 
file(params.dbsnp) : file("${params.outdir}/no_file") +fasta = params.fasta ? file(params.fasta) : file("${params.outdir}/no_file") +germline_resource = params.germline_resource ? file(params.germline_resource) : file("${params.outdir}/no_file") +known_indels = params.known_indels ? file(params.known_indels) : file("${params.outdir}/no_file") +loci = params.ac_loci ? file(params.ac_loci) : file("${params.outdir}/no_file") +loci_gc = params.ac_loci_gc ? file(params.ac_loci_gc) : file("${params.outdir}/no_file") +mappability = params.mappability ? file(params.mappability) : file("${params.outdir}/no_file") // Initialize value channels based on params, defined in the params.genomes[params.genome] scope snpeff_db = params.snpeff_db ?: Channel.empty() @@ -214,15 +216,14 @@ snpeff_species = params.species ?: Channel.empty() vep_cache_version = params.vep_cache_version ?: Channel.empty() // Initialize files channels based on params, not defined within the params.genomes[params.genome] scope -cadd_indels = params.cadd_indels ?: Channel.empty() -cadd_indels_tbi = params.cadd_indels_tbi ?: Channel.empty() -cadd_wg_snvs = params.cadd_wg_snvs ?: Channel.empty() -cadd_wg_snvs_tbi = params.cadd_wg_snvs_tbi ?: Channel.empty() -pon = params.pon ?: Channel.empty() -snpeff_cache = params.snpeff_cache ?: Channel.empty() -//ch_target_bed = params.target_bed ? Channel.value(file(params.target_bed)) : "null" -target_bed = params.target_bed ?: Channel.empty() -vep_cache = params.vep_cache ?: Channel.empty() +cadd_indels = params.cadd_indels ? file(params.cadd_indels) : file("${params.outdir}/no_file") +cadd_indels_tbi = params.cadd_indels_tbi ? file(params.cadd_indels_tbi) : file("${params.outdir}/no_file") +cadd_wg_snvs = params.cadd_wg_snvs ? file(params.cadd_wg_snvs) : file("${params.outdir}/no_file") +cadd_wg_snvs_tbi = params.cadd_wg_snvs_tbi ? file(params.cadd_wg_snvs_tbi) : file("${params.outdir}/no_file") +pon = params.pon ? file(params.pon) : file("${params.outdir}/no_file") +snpeff_cache = params.snpeff_cache ? file(params.snpeff_cache) : file("${params.outdir}/no_file") +target_bed = params.target_bed ? file(params.target_bed) : file("${params.outdir}/no_file") +vep_cache = params.vep_cache ? file(params.vep_cache) : file("${params.outdir}/no_file") // Initialize value channels based on params, not defined within the params.genomes[params.genome] scope read_structure1 = params.read_structure1 ?: Channel.empty() @@ -354,14 +355,14 @@ workflow { step, tools) - bwa = params.bwa ?: BUILD_INDICES.out.bwa - dbsnp_tbi = params.dbsnp ? params.dbsnp_index ?: BUILD_INDICES.out.dbsnp_tbi : Channel.empty() - dict = params.dict ?: BUILD_INDICES.out.dict - fai = params.fasta_fai ?: BUILD_INDICES.out.fai - germline_resource_tbi = params.germline_resource ? params.germline_resource_index ?: BUILD_INDICES.out.germline_resource_tbi : Channel.empty() + bwa = params.bwa ? file(params.bwa) : BUILD_INDICES.out.bwa + dbsnp_tbi = params.dbsnp ? params.dbsnp_index ? file(params.dbsnp_index) : BUILD_INDICES.out.dbsnp_tbi : Channel.empty() + dict = params.dict ? file(params.dict) : BUILD_INDICES.out.dict + fai = params.fasta_fai ? file(params.fasta_fai) : BUILD_INDICES.out.fai + germline_resource_tbi = params.germline_resource ? params.germline_resource_index ? file(params.germline_resource_index) : BUILD_INDICES.out.germline_resource_tbi : Channel.empty() intervals = BUILD_INDICES.out.intervals - known_indels_tbi = params.known_indels ? 
params.known_indels_index ?: BUILD_INDICES.out.known_indels_tbi.collect() : Channel.empty() - pon_tbi = params.pon ? params.pon_index ?: BUILD_INDICES.out.pon_tbi : Channel.empty() + known_indels_tbi = params.known_indels ? params.known_indels_index ? file(params.known_indels_index) : BUILD_INDICES.out.known_indels_tbi.collect() : Channel.empty() + pon_tbi = params.pon ? params.pon_index ? file(params.pon_index) : BUILD_INDICES.out.pon_tbi : Channel.empty() /* ================================================================================ @@ -422,22 +423,21 @@ workflow { bam_mapped = bam_single.mix(MERGE_BAM_MAPPED(bam_multiple)) //if(save_bam_mapped || !(params.known_indels)) //TODO: https://github.com/nf-core/sarek/blob/bce378e09de25bb26c388b917f93f84806d3ba27/main.nf#L1478 - //But if SAMTOOLS_INDEX is not run, markduplicates does not work - bam_mapped = SAMTOOLS_INDEX_MAPPED(bam_mapped) + //But if SAMTOOLS_INDEX is not run, markduplicates does not work + bam_mapped = SAMTOOLS_INDEX_MAPPED(bam_mapped) // STEP 2: MARKING DUPLICATES - report_markduplicates = Channel.empty() + markduplicates_report = Channel.empty() + markduplicates_bam = bam_mapped - bam_markduplicates = bam_mapped - if (!(params.skip_markduplicates)) { - bam_mapped.dump() + if (!params.skip_markduplicates) { MARKDUPLICATES(bam_mapped) - report_markduplicates = MARKDUPLICATES.out.report - bam_markduplicates = MARKDUPLICATES.out.bam + markduplicates_report = MARKDUPLICATES.out.report + markduplicates_bam = MARKDUPLICATES.out.bam } // STEP 3: CREATING RECALIBRATION TABLES - bam_baserecalibrator = bam_markduplicates.combine(BUILD_INDICES.out.intervals) + bam_baserecalibrator = markduplicates_bam.combine(intervals) BASERECALIBRATOR(bam_baserecalibrator, dbsnp, dbsnp_tbi, dict, fai, fasta, known_indels, known_indels_tbi) // STEP 3.5: MERGING RECALIBRATION TABLES @@ -464,31 +464,34 @@ workflow { } GATHERBQSRREPORTS(recaltable) - // if ('baserecalibrator' in skip_qc) baseRecalibratorReport.close() + table = GATHERBQSRREPORTS.out.table + } else { + table = BASERECALIBRATOR.out.report } // STEP 4: RECALIBRATING - bam_applybqsr = MARKDUPLICATES.out.bam.join(GATHERBQSRREPORTS.out.table) //by:[0] - bam_applybqsr = bam_applybqsr.combine(BUILD_INDICES.out.intervals) + applybqsr_bam = markduplicates_bam.join(table) + + applybqsr_bam = applybqsr_bam.combine(intervals) // if (step == 'recalibrate') bamApplyBQSR = input_sample - APPLYBQSR(bam_applybqsr, dict, fasta, fai) + APPLYBQSR(applybqsr_bam, dict, fasta, fai) // STEP 4.5: MERGING AND INDEXING THE RECALIBRATED BAM FILES - if (!(params.no_intervals)){ + if (!params.no_intervals) { MERGE_BAM_RECAL(APPLYBQSR.out) - SAMTOOLS_INDEX_RECAL(MERGE_BAM_RECAL.out) - }else{ - SAMTOOLS_INDEX_RECAL(APPLYBQSR.out) + recal = MERGE_BAM_RECAL.out + } else { + recal = APPLYBQSR.out } // STEP 5: QC - if(!('samtools' in skip_qc)) - SAMTOOLS_STATS(MERGE_BAM_RECAL.out) - //TODO This should work but somehow BAMQC is not called - bamqc = BWAMEM2_MEM.out//.mix(MERGE_BAM_RECAL.out) - //if(!('bamqc' in skipQC)) - BAMQC(bamqc)//, target_bed) + if (!('samtools' in skip_qc)) { + SAMTOOLS_STATS(BWAMEM2_MEM.out.mix(recal)) + } + if (!('bamqc' in skip_qc)) { + BAMQC(BWAMEM2_MEM.out.mix(recal), target_bed) + } /* ================================================================================ @@ -529,7 +532,7 @@ workflow { QC_TRIM.out.trimgalore_zip.collect().ifEmpty([]), multiqc_config, multiqc_custom_config.ifEmpty([]), - report_markduplicates.collect().ifEmpty([]), + markduplicates_report.collect().ifEmpty([]), 
workflow_summary) } From cdfff6aafae1a8b9e9dc06eb257d05ba85041710 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 4 Aug 2020 16:34:41 +0200 Subject: [PATCH 097/200] code polishing --- modules/local/process/merge_bam.nf | 2 -- 1 file changed, 2 deletions(-) diff --git a/modules/local/process/merge_bam.nf b/modules/local/process/merge_bam.nf index 9727e4a7d1..623d135e34 100644 --- a/modules/local/process/merge_bam.nf +++ b/modules/local/process/merge_bam.nf @@ -14,6 +14,4 @@ process MERGE_BAM { """ samtools merge --threads ${task.cpus} ${meta.sample}.bam ${bam} """ - //TODO Naming? - //samtools merge --threads ${task.cpus} ${idSample}.recal.bam ${bam} } \ No newline at end of file From dd466c59e61b8b84fd750f11823ab56aff52697f Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 4 Aug 2020 16:34:57 +0200 Subject: [PATCH 098/200] code polishing --- modules/local/subworkflow/build_indices.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/local/subworkflow/build_indices.nf b/modules/local/subworkflow/build_indices.nf index 4c5a23841c..2e5724c446 100644 --- a/modules/local/subworkflow/build_indices.nf +++ b/modules/local/subworkflow/build_indices.nf @@ -61,9 +61,9 @@ workflow BUILD_INDICES{ result_intervals = Channel.from(file("${params.outdir}/no_intervals.bed")) } else if (!('annotate' in step) && !('controlfreec' in step)) if (!params.intervals) - result_intervals = CREATE_INTERVALS_BED(BUILD_INTERVALS(SAMTOOLS_FAIDX.out)) + result_intervals = CREATE_INTERVALS_BED(result_fai) else - result_intervals = CREATE_INTERVALS_BED(params.intervals) + result_intervals = CREATE_INTERVALS_BED(file(params.intervals)) if (!params.no_intervals) { result_intervals = result_intervals.flatten() From 45f51dc07a03104103c30464bf6e8fdaf4995347 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 4 Aug 2020 16:35:19 +0200 Subject: [PATCH 099/200] fix target_bed --- modules/nf-core/software/qualimap_bamqc.nf | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/modules/nf-core/software/qualimap_bamqc.nf b/modules/nf-core/software/qualimap_bamqc.nf index 26bea1041d..233add760f 100644 --- a/modules/nf-core/software/qualimap_bamqc.nf +++ b/modules/nf-core/software/qualimap_bamqc.nf @@ -2,28 +2,26 @@ process QUALIMAP_BAMQC { label 'memory_max' label 'cpus_16' - //tag "${meta.id}" + tag "${meta.id}" publishDir "${params.outdir}/Reports/${meta.id}/bamQC", mode: params.publish_dir_mode input: tuple val(meta), path(bam) - //path(targetBED) + path(target_bed) output: path("${bam.baseName}") - // //when: !('bamqc' in skip_qc) - script: - use_bed = ''//params.target_bed ? "-gff ${targetBED}" : '' + use_bed = params.target_bed ? 
"-gff ${target_bed}" : '' """ qualimap --java-mem-size=${task.memory.toGiga()}G \ bamqc \ -bam ${bam} \ --paint-chromosome-limits \ --genome-gc-distr HUMAN \ - $use_bed \ + ${use_bed} \ -nt ${task.cpus} \ -skip-duplicated \ --skip-dup-mode 0 \ From d3c847988e98123a6e001f1cd57db0a6770215c4 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 4 Aug 2020 16:35:34 +0200 Subject: [PATCH 100/200] code polishing --- modules/nf-core/software/samtools_index.nf | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/modules/nf-core/software/samtools_index.nf b/modules/nf-core/software/samtools_index.nf index db2a1f3221..25372a1063 100644 --- a/modules/nf-core/software/samtools_index.nf +++ b/modules/nf-core/software/samtools_index.nf @@ -14,11 +14,9 @@ process SAMTOOLS_INDEX { output: tuple val(meta), path(bam), path("*.bai") - //set idPatient, idSample into tsv_bam_indexed script: """ - samtools index $bam + samtools index ${bam} """ - // samtools index ${idSample}.recal.bam TODO: is the naming here relevant? } \ No newline at end of file From 11e478919cf159ff3afb1936b6c26e0c1b2001de Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 4 Aug 2020 16:50:31 +0200 Subject: [PATCH 101/200] merged recal bams are now longer empty --- main.nf | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index b446c3bef6..0a392ef017 100644 --- a/main.nf +++ b/main.nf @@ -478,7 +478,28 @@ workflow { // STEP 4.5: MERGING AND INDEXING THE RECALIBRATED BAM FILES if (!params.no_intervals) { - MERGE_BAM_RECAL(APPLYBQSR.out) + APPLYBQSR.out.map{ meta, bam -> //, bai -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + [patient, sample, gender, status, bam] //, bai] + }.groupTuple(by: [0,1]).set{ bam_recal_to_merge } + + bam_recal_to_merge = bam_recal_to_merge.map { + patient, sample, gender, status, bam -> //, bai -> + + def meta = [:] + meta.patient = patient + meta.sample = sample + meta.gender = gender[0] + meta.status = status[0] + meta.id = sample + + [meta, bam] + } + + MERGE_BAM_RECAL(bam_recal_to_merge) recal = MERGE_BAM_RECAL.out } else { recal = APPLYBQSR.out From 288de4124e88df1a3b52ef4051677f5fa2c10a0d Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 5 Aug 2020 09:06:14 +0200 Subject: [PATCH 102/200] code polishing --- main.nf | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/main.nf b/main.nf index 0a392ef017..334214614b 100644 --- a/main.nf +++ b/main.nf @@ -6,6 +6,7 @@ ================================================================================ Started March 2016. Ported to nf-core May 2019. +Ported to DSL 2 July 2020. -------------------------------------------------------------------------------- nf-core/sarek: An open-source analysis pipeline to detect germline or somatic variants @@ -355,14 +356,14 @@ workflow { step, tools) - bwa = params.bwa ? file(params.bwa) : BUILD_INDICES.out.bwa - dbsnp_tbi = params.dbsnp ? params.dbsnp_index ? file(params.dbsnp_index) : BUILD_INDICES.out.dbsnp_tbi : Channel.empty() - dict = params.dict ? file(params.dict) : BUILD_INDICES.out.dict - fai = params.fasta_fai ? file(params.fasta_fai) : BUILD_INDICES.out.fai - germline_resource_tbi = params.germline_resource ? params.germline_resource_index ? file(params.germline_resource_index) : BUILD_INDICES.out.germline_resource_tbi : Channel.empty() + bwa = params.bwa ? file(params.bwa) : BUILD_INDICES.out.bwa + dict = params.dict ? 
file(params.dict) : BUILD_INDICES.out.dict + fai = params.fasta_fai ? file(params.fasta_fai) : BUILD_INDICES.out.fai intervals = BUILD_INDICES.out.intervals - known_indels_tbi = params.known_indels ? params.known_indels_index ? file(params.known_indels_index) : BUILD_INDICES.out.known_indels_tbi.collect() : Channel.empty() - pon_tbi = params.pon ? params.pon_index ? file(params.pon_index) : BUILD_INDICES.out.pon_tbi : Channel.empty() + germline_resource_tbi = params.germline_resource ? params.germline_resource_index ? file(params.germline_resource_index) : BUILD_INDICES.out.germline_resource_tbi : file("${params.outdir}/no_file") + dbsnp_tbi = params.dbsnp ? params.dbsnp_index ? file(params.dbsnp_index) : BUILD_INDICES.out.dbsnp_tbi : file("${params.outdir}/no_file") + known_indels_tbi = params.known_indels ? params.known_indels_index ? file(params.known_indels_index) : BUILD_INDICES.out.known_indels_tbi.collect() : file("${params.outdir}/no_file") + pon_tbi = params.pon ? params.pon_index ? file(params.pon_index) : BUILD_INDICES.out.pon_tbi : file("${params.outdir}/no_file") /* ================================================================================ From c792f8e714f7260a4fb423409419baef37256d2f Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 5 Aug 2020 14:50:42 +0200 Subject: [PATCH 103/200] more options for modules --- conf/modules.config | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 3e3d051022..6d1df81e7d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -9,7 +9,7 @@ params { 'fastqc' { args = "--quiet" suffix = "" - publish_dir = "fastqc" + publish_dir = "Reports/FastQC" publish_results = "all" } 'trimgalore' { @@ -32,6 +32,18 @@ params { publish_dir = "" publish_results = "all" } + 'samtools_index_mapped' { + args = "" + suffix = "" + publish_dir = "Mapped" + publish_results = "all" + } + 'samtools_index_recal' { + args = "" + suffix = "recal" + publish_dir = "" + publish_results = "all" + } 'gatk_markduplicates' { args = "ASSUME_SORTED=true REMOVE_DUPLICATES=false VALIDATION_STRINGENCY=LENIENT TMP_DIR=tmp" suffix = ".mLb.mkD" From 61ac666aafbb67ff0d9872067c8c38dc3719a227 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 5 Aug 2020 14:51:27 +0200 Subject: [PATCH 104/200] tsv files generated --- main.nf | 401 ++++++++++++++++++++------------------------------------ 1 file changed, 144 insertions(+), 257 deletions(-) diff --git a/main.nf b/main.nf index 334214614b..1a08a9d127 100644 --- a/main.nf +++ b/main.nf @@ -260,11 +260,11 @@ if (params.sentieon) log.warn "[nf-core/sarek] Sentieon will be used, only works ================================================================================ */ -include { BWAMEM2_MEM } from './modules/local/process/bwamem2_mem' -include { GET_SOFTWARE_VERSIONS } from './modules/local/process/get_software_versions' -include { OUTPUT_DOCUMENTATION } from './modules/local/process/output_documentation' -include { MERGE_BAM as MERGE_BAM_MAPPED; - MERGE_BAM as MERGE_BAM_RECAL;} from './modules/local/process/merge_bam' +include { BWAMEM2_MEM } from './modules/local/process/bwamem2_mem' +include { GET_SOFTWARE_VERSIONS } from './modules/local/process/get_software_versions' +include { OUTPUT_DOCUMENTATION } from './modules/local/process/output_documentation' +include { MERGE_BAM as MERGE_BAM_MAPPED } from './modules/local/process/merge_bam' +include { MERGE_BAM as MERGE_BAM_RECAL } from './modules/local/process/merge_bam' /* 
================================================================================ @@ -356,12 +356,14 @@ workflow { step, tools) + intervals = BUILD_INDICES.out.intervals + bwa = params.bwa ? file(params.bwa) : BUILD_INDICES.out.bwa dict = params.dict ? file(params.dict) : BUILD_INDICES.out.dict fai = params.fasta_fai ? file(params.fasta_fai) : BUILD_INDICES.out.fai - intervals = BUILD_INDICES.out.intervals - germline_resource_tbi = params.germline_resource ? params.germline_resource_index ? file(params.germline_resource_index) : BUILD_INDICES.out.germline_resource_tbi : file("${params.outdir}/no_file") + dbsnp_tbi = params.dbsnp ? params.dbsnp_index ? file(params.dbsnp_index) : BUILD_INDICES.out.dbsnp_tbi : file("${params.outdir}/no_file") + germline_resource_tbi = params.germline_resource ? params.germline_resource_index ? file(params.germline_resource_index) : BUILD_INDICES.out.germline_resource_tbi : file("${params.outdir}/no_file") known_indels_tbi = params.known_indels ? params.known_indels_index ? file(params.known_indels_index) : BUILD_INDICES.out.known_indels_tbi.collect() : file("${params.outdir}/no_file") pon_tbi = params.pon ? params.pon_index ? file(params.pon_index) : BUILD_INDICES.out.pon_tbi : file("${params.outdir}/no_file") @@ -370,7 +372,9 @@ workflow { PREPROCESSING ================================================================================ */ + // STEP 0.5: QC ON READS + QC_TRIM( input_sample, ('fastqc' in skip_qc), @@ -379,22 +383,28 @@ workflow { params.modules['trimgalore'] ) + reads_input = QC_TRIM.out.reads + // STEP 1: MAPPING READS TO REFERENCE GENOME WITH BWA MEM - BWAMEM2_MEM(QC_TRIM.out.reads, bwa, fasta, fai, params.modules['bwamem2_mem']) - BWAMEM2_MEM.out.map{ meta, bam -> //, bai -> + + BWAMEM2_MEM(reads_input, bwa, fasta, fai, params.modules['bwamem2_mem']) + + bam_bwamem2 = BWAMEM2_MEM.out + + bam_bwamem2.map{ meta, bam -> patient = meta.patient sample = meta.sample gender = meta.gender status = meta.status - [patient, sample, gender, status, bam] //, bai] + [patient, sample, gender, status, bam] }.groupTuple(by: [0,1]) .branch{ single: it[4].size() == 1 multiple: it[4].size() > 1 - }.set{ bam } + }.set{ bam_bwamem2_to_sort } - bam_single = bam.single.map { - patient, sample, gender, status, bam -> //, bai -> + bam_bwamem2_single = bam_bwamem2_to_sort.single.map { + patient, sample, gender, status, bam -> def meta = [:] meta.patient = patient @@ -403,11 +413,11 @@ workflow { meta.status = status[0] meta.id = sample - [meta, bam[0]] // , bai[0]] + [meta, bam[0]] } - bam_multiple = bam.multiple.map { - patient, sample, gender, status, bam -> //, bai -> + bam_bwamem2_multiple = bam_bwamem2_to_sort.multiple.map { + patient, sample, gender, status, bam -> def meta = [:] meta.patient = patient @@ -421,25 +431,76 @@ workflow { // STEP 1.5: MERGING AND INDEXING BAM FROM MULTIPLE LANES - bam_mapped = bam_single.mix(MERGE_BAM_MAPPED(bam_multiple)) - //if(save_bam_mapped || !(params.known_indels)) - //TODO: https://github.com/nf-core/sarek/blob/bce378e09de25bb26c388b917f93f84806d3ba27/main.nf#L1478 - //But if SAMTOOLS_INDEX is not run, markduplicates does not work - bam_mapped = SAMTOOLS_INDEX_MAPPED(bam_mapped) + MERGE_BAM_MAPPED(bam_bwamem2_multiple) + bam_mapped = bam_bwamem2_single.mix(MERGE_BAM_MAPPED.out.bam) + bam_mapped = SAMTOOLS_INDEX_MAPPED(bam_mapped, params.modules['samtools_index_mapped'], +) // STEP 2: MARKING DUPLICATES - markduplicates_report = Channel.empty() - markduplicates_bam = bam_mapped + + report_markduplicates = Channel.empty() + 
bam_markduplicates = bam_mapped if (!params.skip_markduplicates) { MARKDUPLICATES(bam_mapped) - markduplicates_report = MARKDUPLICATES.out.report - markduplicates_bam = MARKDUPLICATES.out.bam + report_markduplicates = MARKDUPLICATES.out.report + bam_markduplicates = MARKDUPLICATES.out.bam + tsv_markduplicates = MARKDUPLICATES.out.tsv + + // Creating TSV files to restart from this step + tsv_markduplicates.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + bam = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam" + bai = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam.bai" + table = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.recal.table" + ["duplicates_marked_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n"] + } + + tsv_markduplicates.map { meta -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + bam = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam" + bai = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam.bai" + table = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.recal.table" + "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n" + }.collectFile(name: 'duplicates_marked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV") + } else { + tsv_no_markduplicates = bam_markduplicates.map { meta, bam -> [meta] } + + // Creating TSV files to restart from this step + tsv_no_markduplicates.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + bam = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.md.bam" + bai = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.md.bam.bai" + table = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.recal.table" + ["mapped_no_duplicates_marked_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n"] + } + + tsv_no_markduplicates.map { meta -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + bam = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.md.bam" + bai = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.md.bam.bai" + table = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.recal.table" + "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n" + }.collectFile(name: 'mapped_no_duplicates_marked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV") } // STEP 3: CREATING RECALIBRATION TABLES - bam_baserecalibrator = markduplicates_bam.combine(intervals) + bam_baserecalibrator = bam_markduplicates.combine(intervals) BASERECALIBRATOR(bam_baserecalibrator, dbsnp, dbsnp_tbi, dict, fai, fasta, known_indels, known_indels_tbi) + table_bqsr = BASERECALIBRATOR.out.report + tsv_bqsr = BASERECALIBRATOR.out.tsv // STEP 3.5: MERGING RECALIBRATION TABLES if (!params.no_intervals) { @@ -465,21 +526,45 @@ workflow { } GATHERBQSRREPORTS(recaltable) - table = GATHERBQSRREPORTS.out.table - } else { - table = BASERECALIBRATOR.out.report + table_bqsr = GATHERBQSRREPORTS.out.table + tsv_bqsr = GATHERBQSRREPORTS.out.tsv + + } + + // Creating TSV files to restart from this step + tsv_bqsr.collectFile(storeDir: 
"${params.outdir}/Preprocessing/TSV") { meta -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + bam = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam" + bai = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam.bai" + ["duplicates_marked_no_table_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n"] } + tsv_bqsr.map { meta -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + bam = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam" + bai = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam.bai" + "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n" + }.collectFile(name: 'duplicates_marked_no_table.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV") + // STEP 4: RECALIBRATING - applybqsr_bam = markduplicates_bam.join(table) + bam_applybqsr = bam_markduplicates.join(table_bqsr) - applybqsr_bam = applybqsr_bam.combine(intervals) - // if (step == 'recalibrate') bamApplyBQSR = input_sample - APPLYBQSR(applybqsr_bam, dict, fasta, fai) + bam_applybqsr = bam_applybqsr.combine(intervals) + + APPLYBQSR(bam_applybqsr, dict, fasta, fai) + + bam_recalibrated = APPLYBQSR.out.bam + tsv_recalibrated = APPLYBQSR.out.tsv // STEP 4.5: MERGING AND INDEXING THE RECALIBRATED BAM FILES if (!params.no_intervals) { - APPLYBQSR.out.map{ meta, bam -> //, bai -> + APPLYBQSR.out.bam.map{ meta, bam -> //, bai -> patient = meta.patient sample = meta.sample gender = meta.gender @@ -501,19 +586,34 @@ workflow { } MERGE_BAM_RECAL(bam_recal_to_merge) - recal = MERGE_BAM_RECAL.out - } else { - recal = APPLYBQSR.out + bam_recalibrated = MERGE_BAM_RECAL.out.bam + tsv_recalibrated = MERGE_BAM_RECAL.out.tsv } - // STEP 5: QC - if (!('samtools' in skip_qc)) { - SAMTOOLS_STATS(BWAMEM2_MEM.out.mix(recal)) + // Creating TSV files to restart from this step + tsv_recalibrated.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + bam = "${params.outdir}/Preprocessing/${sample}/Recalibrated/${sample}.md.bam" + bai = "${params.outdir}/Preprocessing/${sample}/Recalibrated/${sample}.md.bam.bai" + ["recalibrated_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n"] } - if (!('bamqc' in skip_qc)) { - BAMQC(BWAMEM2_MEM.out.mix(recal), target_bed) - } + tsv_recalibrated.map { meta -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + bam = "${params.outdir}/Preprocessing/${sample}/Recalibrated/${sample}.md.bam" + bai = "${params.outdir}/Preprocessing/${sample}/Recalibrated/${sample}.md.bam.bai" + "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n" + }.collectFile(name: 'recalibrated.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV") + + // STEP 5: QC + if (!'samtools' in skip_qc) SAMTOOLS_STATS(bam_bwamem2.mix(recal)) + if (!'bamqc' in skip_qc) BAMQC(bam_bwamem2.mix(recal), target_bed) /* ================================================================================ @@ -554,7 +654,7 @@ workflow { QC_TRIM.out.trimgalore_zip.collect().ifEmpty([]), multiqc_config, multiqc_custom_config.ifEmpty([]), - markduplicates_report.collect().ifEmpty([]), + report_markduplicates.collect().ifEmpty([]), workflow_summary) } @@ -570,56 +670,6 @@ workflow.onComplete { Completion.summary(workflow, params, log) 
} -// /* -// ================================================================================ -// PREPROCESSING -// ================================================================================ -// */ - - -// // STEP 0.5: QC ON READS - -// // TODO: Use only one process for FastQC for FASTQ files and uBAM files -// // FASTQ and uBAM files are renamed based on the sample name - - -// process FastQCBAM { -// label 'FastQC' -// label 'cpus_2' - -// tag "${idPatient}-${idRun}" - -// publishDir "${params.outdir}/Reports/${idSample}/FastQC/${idSample}_${idRun}", mode: params.publish_dir_mode - -// input: -// set idPatient, idSample, idRun, file("${idSample}_${idRun}.bam") from input_bam_fastqc - -// output: -// file("*.{html,zip}") into fastQCBAMReport - -// when: !('fastqc' in skip_qc) - -// script: -// """ -// fastqc -t 2 -q ${idSample}_${idRun}.bam -// """ -// } - -// fastQCReport = fastQCFQReport.mix(fastQCBAMReport) - - -// // STEP 1: MAPPING READS TO REFERENCE GENOME WITH BWA MEM - -//TODO: needs to be covered when inputBam is supported -// if (params.trim_fastq) input_pair_reads = outputPairReadsTrimGalore //this is covered -// else input_pair_reads = input_pair_reads.mix(input_bam) - -// // STEP 1.5: MERGING BAM FROM MULTIPLE LANES - - -// if (!save_bam_mapped) tsv_bam_indexed.close() - - // // Creating a TSV file to restart from this step // tsv_bam_indexed.map { idPatient, idSample -> // gender = gender_map[idPatient] @@ -640,169 +690,6 @@ workflow.onComplete { // ["mapped_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] // } - -// // STEP 2: MARKING DUPLICATES - - -// // Creating a TSV file to restart from this step -// tsv_bam_duplicates_marked.map { idPatient, idSample -> -// gender = gender_map[idPatient] -// status = status_map[idPatient, idSample] -// bam = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam" -// bai = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam.bai" -// "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n" -// }.collectFile( -// name: 'duplicates_marked_no_table.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" -// ) - -// tsv_bam_duplicates_marked_sample -// .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { idPatient, idSample -> -// status = status_map[idPatient, idSample] -// gender = gender_map[idPatient] -// bam = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam" -// bai = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam.bai" -// ["duplicates_marked_no_table_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] -// } - -// if ('markduplicates' in skip_qc) duplicates_marked_report.close() - -// if (step == 'preparerecalibration') bam_duplicates_marked = input_sample - - -// // STEP 3: CREATING RECALIBRATION TABLES - - -// if (params.no_intervals) { -// (tableGatherBQSRReports, tableGatherBQSRReportsNoInt) = tableGatherBQSRReports.into(2) -// recalTable = tableGatherBQSRReportsNoInt -// } else recalTableTSVnoInt.close() - -// // STEP 3.5: MERGING RECALIBRATION TABLES - - -// if ('baserecalibrator' in skip_qc) baseRecalibratorReport.close() - - -// (recalTableTSV, recalTableSampleTSV) = recalTableTSV.mix(recalTableTSVnoInt).into(2) - -// // Create TSV files to restart from this step -// if (params.skip_markduplicates) { -// recalTableTSV.map { idPatient, idSample -> -// status = status_map[idPatient, idSample] -// gender = 
gender_map[idPatient] -// bam = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam" -// bai = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam.bai" -// recalTable = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.recal.table" -// "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n" -// }.collectFile( -// name: 'mapped_no_duplicates_marked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" -// ) - -// recalTableSampleTSV -// .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV/") { -// idPatient, idSample -> -// status = status_map[idPatient, idSample] -// gender = gender_map[idPatient] -// bam = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam" -// bai = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam.bai" -// recalTable = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.recal.table" -// ["mapped_no_duplicates_marked_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n"] -// } -// } else { -// recalTableTSV.map { idPatient, idSample -> -// status = status_map[idPatient, idSample] -// gender = gender_map[idPatient] -// bam = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam" -// bai = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam.bai" -// recalTable = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.recal.table" - -// "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n" -// }.collectFile( -// name: 'duplicates_marked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" -// ) - -// recalTableSampleTSV -// .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV/") { -// idPatient, idSample -> -// status = status_map[idPatient, idSample] -// gender = gender_map[idPatient] -// bam = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam" -// bai = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.md.bam.bai" -// recalTable = "${params.outdir}/Preprocessing/${idSample}/DuplicatesMarked/${idSample}.recal.table" -// ["duplicates_marked_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n"] -// } -// } - - -// // STEP 4: RECALIBRATING - -// process ApplyBQSR { - - -// (bam_recalibrated_to_merge, bam_recalibrated_to_index) = bam_recalibrated_to_merge.groupTuple(by:[0, 1]).into(2) - - -// // STEP 4.5: MERGING THE RECALIBRATED BAM FILES - -// process MergeBamRecal { -// label 'cpus_8' - -// tag "${idPatient}-${idSample}" - -// publishDir "${params.outdir}/Preprocessing/${idSample}/Recalibrated", mode: params.publish_dir_mode - -// input: -// set idPatient, idSample, file(bam) from bam_recalibrated_to_merge - -// output: -// set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bam.bai") into bam_recalibrated -// set idPatient, idSample, file("${idSample}.recal.bam") into bam_recalibrated_qc -// set idPatient, idSample into tsv_bam_recalibrated - -// when: !(params.no_intervals) - -// script: -// """ -// samtools merge --threads ${task.cpus} ${idSample}.recal.bam ${bam} -// samtools index ${idSample}.recal.bam -// """ -// } - -// // STEP 4.5': INDEXING THE RECALIBRATED BAM FILES - - - -// bam_recalibrated = bam_recalibrated.mix(bam_recalibrated_indexed) -// bam_recalibrated_qc = bam_recalibrated_qc.mix(bam_recalibrated_no_int_qc) -// tsv_bam_recalibrated = 
tsv_bam_recalibrated.mix(tsv_bam_recalibrated_no_int) - -// (bam_recalibrated_bamqc, bam_recalibrated_samtools_stats) = bam_recalibrated_qc.into(2) -// (tsv_bam_recalibrated, tsv_bam_recalibrated_sample) = tsv_bam_recalibrated.into(2) - -// // Creating a TSV file to restart from this step -// tsv_bam_recalibrated.map { idPatient, idSample -> -// gender = gender_map[idPatient] -// status = status_map[idPatient, idSample] -// bam = "${params.outdir}/Preprocessing/${idSample}/Recalibrated/${idSample}.recal.bam" -// bai = "${params.outdir}/Preprocessing/${idSample}/Recalibrated/${idSample}.recal.bam.bai" -// "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n" -// }.collectFile( -// name: 'recalibrated.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" -// ) - -// tsv_bam_recalibrated_sample -// .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { -// idPatient, idSample -> -// status = status_map[idPatient, idSample] -// gender = gender_map[idPatient] -// bam = "${params.outdir}/Preprocessing/${idSample}/Recalibrated/${idSample}.recal.bam" -// bai = "${params.outdir}/Preprocessing/${idSample}/Recalibrated/${idSample}.recal.bam.bai" -// ["recalibrated_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] -// } - -// // STEP 5: QC - - // /* // ================================================================================ // GERMLINE VARIANT CALLING From 20dfe07a72760a884f1680627e2cd09734d46aef Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 5 Aug 2020 14:52:06 +0200 Subject: [PATCH 105/200] code polishing --- modules/local/process/merge_bam.nf | 3 ++- modules/nf-core/software/fastqc.nf | 2 +- modules/nf-core/software/gatk_applybqsr.nf | 4 +++- modules/nf-core/software/samtools_index.nf | 18 +++++++++++------- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/modules/local/process/merge_bam.nf b/modules/local/process/merge_bam.nf index 623d135e34..3056e12574 100644 --- a/modules/local/process/merge_bam.nf +++ b/modules/local/process/merge_bam.nf @@ -8,7 +8,8 @@ process MERGE_BAM { tuple val(meta), path(bam) output: - tuple val(meta), path("${meta.sample}.bam") + tuple val(meta), path("${meta.sample}.bam"), emit: bam + val meta, emit: tsv script: """ diff --git a/modules/nf-core/software/fastqc.nf b/modules/nf-core/software/fastqc.nf index 2f1fe732be..009710d53a 100644 --- a/modules/nf-core/software/fastqc.nf +++ b/modules/nf-core/software/fastqc.nf @@ -3,7 +3,7 @@ process FASTQC { label 'process_medium' label 'cpus_2' - publishDir "${params.outdir}/${options.publish_dir}", + publishDir "${params.outdir}/${options.publish_dir}/${meta.sample}/${meta.id}", mode: params.publish_dir_mode, saveAs: { filename -> if (options.publish_results == "none") null diff --git a/modules/nf-core/software/gatk_applybqsr.nf b/modules/nf-core/software/gatk_applybqsr.nf index cd267e1e78..c6fe890f6e 100644 --- a/modules/nf-core/software/gatk_applybqsr.nf +++ b/modules/nf-core/software/gatk_applybqsr.nf @@ -11,7 +11,9 @@ process GATK_APPLYBQSR { path fai output: - tuple val(meta), path("${prefix}${meta.sample}.recal.bam") + tuple val(meta), path("${prefix}${meta.sample}.recal.bam") , emit: bam + val meta, emit: tsv + script: prefix = params.no_intervals ? 
"" : "${interval.baseName}_" diff --git a/modules/nf-core/software/samtools_index.nf b/modules/nf-core/software/samtools_index.nf index 25372a1063..23ef860327 100644 --- a/modules/nf-core/software/samtools_index.nf +++ b/modules/nf-core/software/samtools_index.nf @@ -3,20 +3,24 @@ process SAMTOOLS_INDEX { tag "${meta.id}" -// publishDir params.outdir, mode: params.publish_dir_mode, -// saveAs: { -// if (save_bam_mapped) "Preprocessing/${idSample}/Mapped/${it}" -// else null -// } + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { + if (save_bam_mapped) "Preprocessing/${meta.sample}/Mapped/${it}" + else null + } input: tuple val(meta), path(bam) + val options output: - tuple val(meta), path(bam), path("*.bai") + tuple val(meta), path("${prefix}.bam"), path("*.bai") script: + prefix = options.suffix ? "${meta.id}.${options.suffix}" : "${meta.id}" """ - samtools index ${bam} + [ ! -f ${prefix}.bam ] && ln -s ${bam} ${prefix}.bam + + samtools index ${prefix}.bam """ } \ No newline at end of file From 25661e04d0931f3d33b82c07a6517506c5b1d5a4 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 5 Aug 2020 14:56:04 +0200 Subject: [PATCH 106/200] file -> path --- modules/nf-core/software/gatk_applybqsr.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/nf-core/software/gatk_applybqsr.nf b/modules/nf-core/software/gatk_applybqsr.nf index c6fe890f6e..6c5a47ed9d 100644 --- a/modules/nf-core/software/gatk_applybqsr.nf +++ b/modules/nf-core/software/gatk_applybqsr.nf @@ -5,7 +5,7 @@ process GATK_APPLYBQSR { tag "${meta.id}-${interval.baseName}" input: - tuple val(meta), path(bam), path(bai), path(recalibrationReport), file(interval) + tuple val(meta), path(bam), path(bai), path(recalibrationReport), path(interval) path dict path fasta path fai From 6d5669ec9a21d0a1046cdabd82525d3dec82a705 Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Thu, 6 Aug 2020 10:14:44 +0200 Subject: [PATCH 107/200] Add haplotyoe caller --- .../nf-core/software/gatk_haplotypecaller.nf | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 modules/nf-core/software/gatk_haplotypecaller.nf diff --git a/modules/nf-core/software/gatk_haplotypecaller.nf b/modules/nf-core/software/gatk_haplotypecaller.nf new file mode 100644 index 0000000000..20d9e31f63 --- /dev/null +++ b/modules/nf-core/software/gatk_haplotypecaller.nf @@ -0,0 +1,34 @@ +process GATK_HAPLOTYPECALLER { + label 'MEMORY_SINGLECPU_TASK_SQ' + label 'CPUS_2' + + tag "${meta.id}-${interval.baseName}" + + input: + tuple val(meta), path(bam), path(bai), file(interval) + path dbsnp + path dbsnpIndex + path dict + path fasta + path fai + + output: + tuple val("HaplotypeCallerGVCF"), val(meta), path("${interval.baseName}_${meta.id}.g.vcf") emit: gvcfHaplotypeCaller + tuple val(meta), path(interval), path("${intervalBed.baseName}_${meta.id}.g.vcf") emit: gvcfGenotypeGVCFs + + + + script: + intervalsOptions = params.no_intervals ? "" : "-L ${interval}" + dbsnpOptions = params.dbsnp ? 
"--D ${dbsnp}" : "" + """ + gatk --java-options "-Xmx${task.memory.toGiga()}g -Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ + HaplotypeCaller \ + -R ${fasta} \ + -I ${bam} \ + ${intervalsOptions} \ + ${dbsnpOptions} \ + -O ${interval.baseName}_${meta.id}.g.vcf \ + -ERC GVCF + """ +} From 07acd421c29b9652ed18ab36580521360c6e8b8b Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Thu, 6 Aug 2020 13:17:14 +0200 Subject: [PATCH 108/200] Make haplotyper work --- main.nf | 74 +++++++------------ .../nf-core/software/gatk_haplotypecaller.nf | 4 +- 2 files changed, 27 insertions(+), 51 deletions(-) diff --git a/main.nf b/main.nf index 1a08a9d127..6c65e5d157 100644 --- a/main.nf +++ b/main.nf @@ -288,6 +288,7 @@ include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_MAPPED } from './modules/nf-c include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_RECAL } from './modules/nf-core/software/samtools_index' include { SAMTOOLS_STATS as SAMTOOLS_STATS } from './modules/nf-core/software/samtools_stats' include { QUALIMAP_BAMQC as BAMQC } from './modules/nf-core/software/qualimap_bamqc' +include { GATK_HAPLOTYPECALLER as HAPLOTYPECALLER } from './modules/nf-core/software/gatk_haplotypecaller' include { MULTIQC } from './modules/nf-core/software/multiqc' /* @@ -557,6 +558,7 @@ workflow { bam_applybqsr = bam_applybqsr.combine(intervals) + bam_applybqsr.dump() APPLYBQSR(bam_applybqsr, dict, fasta, fai) bam_recalibrated = APPLYBQSR.out.bam @@ -589,7 +591,15 @@ workflow { bam_recalibrated = MERGE_BAM_RECAL.out.bam tsv_recalibrated = MERGE_BAM_RECAL.out.tsv } + //TODO: set bam_recalibrated with all these steps + // // When using sentieon for mapping, Channel bam_recalibrated is bam_sentieon_recal +// if (params.sentieon && step == 'mapping') bam_recalibrated = bam_sentieon_recal + +// // When no knownIndels for mapping, Channel bam_recalibrated is bam_duplicates_marked +// if (!params.known_indels && step == 'mapping') bam_recalibrated = bam_duplicates_marked +// // When starting with variant calling, Channel bam_recalibrated is input_sample +// if (step == 'variantcalling') bam_recalibrated = input_sample // Creating TSV files to restart from this step tsv_recalibrated.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> patient = meta.patient @@ -620,6 +630,21 @@ workflow { GERMLINE VARIANT CALLING ================================================================================ */ + + bam_recalibrated_indexed = SAMTOOLS_INDEX_RECAL(bam_recalibrated, params.modules['samtools_index_mapped'],) + bam_haplotypecaller = bam_recalibrated_indexed.combine(intervals) + if ('haplotypecaller' in tools) + HAPLOTYPECALLER(bam_haplotypecaller, dbsnp, + dbsnp_tbi, + dict, + fasta, + fai) + + // gvcfHaplotypeCaller = gvcfHaplotypeCaller.groupTuple(by:[0, 1, 2]) + +// if (params.no_gvcf) gvcfHaplotypeCaller.close() +// else gvcfHaplotypeCaller = gvcfHaplotypeCaller.dump(tag:'GVCF HaplotypeCaller') + /* ================================================================================ @@ -696,16 +721,8 @@ workflow.onComplete { // ================================================================================ // */ -// // When using sentieon for mapping, Channel bam_recalibrated is bam_sentieon_recal -// if (params.sentieon && step == 'mapping') bam_recalibrated = bam_sentieon_recal -// // When no knownIndels for mapping, Channel bam_recalibrated is bam_duplicates_marked -// if (!params.known_indels && step == 'mapping') bam_recalibrated = bam_duplicates_marked - -// // When starting with variant calling, Channel 
bam_recalibrated is input_sample -// if (step == 'variantcalling') bam_recalibrated = input_sample -// bam_recalibrated = bam_recalibrated.dump(tag:'BAM for Variant Calling') // // Here we have a recalibrated bam set // // The TSV file is formatted like: "idPatient status idSample bamFile baiFile" @@ -714,55 +731,14 @@ workflow.onComplete { // (bamMantaSingle, bamStrelkaSingle, bamTIDDIT, bamFreebayesSingleNoIntervals, bamHaplotypeCallerNoIntervals, bamRecalAll) = bam_recalibrated.into(6) -// (bam_sentieon_DNAseq, bam_sentieon_DNAscope, bam_sentieon_all) = bam_sentieon_deduped_table.into(3) // // To speed Variant Callers up we are chopping the reference into smaller pieces // // Do variant calling by this intervals, and re-merge the VCFs - -// bamHaplotypeCaller = bamHaplotypeCallerNoIntervals.spread(intHaplotypeCaller) // bamFreebayesSingle = bamFreebayesSingleNoIntervals.spread(intFreebayesSingle) -// // STEP GATK HAPLOTYPECALLER.1 - -// process HaplotypeCaller { -// label 'memory_singleCPU_task_sq' -// label 'cpus_2' - -// tag "${idSample}-${intervalBed.baseName}" - -// input: -// set idPatient, idSample, file(bam), file(bai), file(intervalBed) from bamHaplotypeCaller -// file(dbsnp) from dbsnp -// file(dbsnpIndex) from dbsnp_tbi -// file(dict) from dict -// file(fasta) from fasta -// file(fastaFai) from fai -// output: -// set val("HaplotypeCallerGVCF"), idPatient, idSample, file("${intervalBed.baseName}_${idSample}.g.vcf") into gvcfHaplotypeCaller -// set idPatient, idSample, file(intervalBed), file("${intervalBed.baseName}_${idSample}.g.vcf") into gvcfGenotypeGVCFs -// when: 'haplotypecaller' in tools -// script: -// intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}" -// dbsnpOptions = params.dbsnp ? "--D ${dbsnp}" : "" -// """ -// gatk --java-options "-Xmx${task.memory.toGiga()}g -Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ -// HaplotypeCaller \ -// -R ${fasta} \ -// -I ${bam} \ -// ${intervalsOptions} \ -// ${dbsnpOptions} \ -// -O ${intervalBed.baseName}_${idSample}.g.vcf \ -// -ERC GVCF -// """ -// } - -// gvcfHaplotypeCaller = gvcfHaplotypeCaller.groupTuple(by:[0, 1, 2]) - -// if (params.no_gvcf) gvcfHaplotypeCaller.close() -// else gvcfHaplotypeCaller = gvcfHaplotypeCaller.dump(tag:'GVCF HaplotypeCaller') // // STEP GATK HAPLOTYPECALLER.2 diff --git a/modules/nf-core/software/gatk_haplotypecaller.nf b/modules/nf-core/software/gatk_haplotypecaller.nf index 20d9e31f63..89e11c5397 100644 --- a/modules/nf-core/software/gatk_haplotypecaller.nf +++ b/modules/nf-core/software/gatk_haplotypecaller.nf @@ -13,8 +13,8 @@ process GATK_HAPLOTYPECALLER { path fai output: - tuple val("HaplotypeCallerGVCF"), val(meta), path("${interval.baseName}_${meta.id}.g.vcf") emit: gvcfHaplotypeCaller - tuple val(meta), path(interval), path("${intervalBed.baseName}_${meta.id}.g.vcf") emit: gvcfGenotypeGVCFs + tuple val("HaplotypeCallerGVCF"), val(meta), path("${interval.baseName}_${meta.id}.g.vcf"), emit: gvcfHaplotypeCaller + tuple val(meta), path(interval), path("${interval.baseName}_${meta.id}.g.vcf"), emit: gvcfGenotypeGVCFs From 8c11610de82fdf02eeabe87a3d77cd94ace8f46c Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Thu, 6 Aug 2020 14:07:30 +0200 Subject: [PATCH 109/200] Add gvcfcaller --- main.nf | 150 ++---------------- modules/nf-core/software/gatk_genotypegvcf.nf | 32 ++++ 2 files changed, 47 insertions(+), 135 deletions(-) create mode 100644 modules/nf-core/software/gatk_genotypegvcf.nf diff --git a/main.nf b/main.nf index 6c65e5d157..49d47b4e6e 100644 --- 
a/main.nf +++ b/main.nf @@ -289,6 +289,7 @@ include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_RECAL } from './modules/nf-c include { SAMTOOLS_STATS as SAMTOOLS_STATS } from './modules/nf-core/software/samtools_stats' include { QUALIMAP_BAMQC as BAMQC } from './modules/nf-core/software/qualimap_bamqc' include { GATK_HAPLOTYPECALLER as HAPLOTYPECALLER } from './modules/nf-core/software/gatk_haplotypecaller' +include { GATK_GENOTYPEVCF as GENOTYPEVCF } from './modules/nf-core/software/gatk_genotypegvcf' include { MULTIQC } from './modules/nf-core/software/multiqc' /* @@ -558,7 +559,6 @@ workflow { bam_applybqsr = bam_applybqsr.combine(intervals) - bam_applybqsr.dump() APPLYBQSR(bam_applybqsr, dict, fasta, fai) bam_recalibrated = APPLYBQSR.out.bam @@ -630,22 +630,30 @@ workflow { GERMLINE VARIANT CALLING ================================================================================ */ - + //TODO double check whether the indexing has to be repeated here. there is a bai file somewhere up at ApplyBQSR bam_recalibrated_indexed = SAMTOOLS_INDEX_RECAL(bam_recalibrated, params.modules['samtools_index_mapped'],) bam_haplotypecaller = bam_recalibrated_indexed.combine(intervals) - if ('haplotypecaller' in tools) + if ('haplotypecaller' in tools){ + // STEP GATK HAPLOTYPECALLER.1 + HAPLOTYPECALLER(bam_haplotypecaller, dbsnp, dbsnp_tbi, dict, fasta, fai) - // gvcfHaplotypeCaller = gvcfHaplotypeCaller.groupTuple(by:[0, 1, 2]) - -// if (params.no_gvcf) gvcfHaplotypeCaller.close() -// else gvcfHaplotypeCaller = gvcfHaplotypeCaller.dump(tag:'GVCF HaplotypeCaller') + + // STEP GATK HAPLOTYPECALLER.2 + GENOTYPEVCF(HAPLOTYPECALLER.out.gvcfGenotypeGVCFs, dbsnp, + dbsnp_tbi, + dict, + fasta, + fai) + // vcfGenotypeGVCFs = vcfGenotypeGVCFs.groupTuple(by:[0, 1, 2]) + } + /* ================================================================================ SOMATIC VARIANT CALLING @@ -740,136 +748,8 @@ workflow.onComplete { -// // STEP GATK HAPLOTYPECALLER.2 - -// process GenotypeGVCFs { -// tag "${idSample}-${intervalBed.baseName}" - -// input: -// set idPatient, idSample, file(intervalBed), file(gvcf) from gvcfGenotypeGVCFs -// file(dbsnp) from dbsnp -// file(dbsnpIndex) from dbsnp_tbi -// file(dict) from dict -// file(fasta) from fasta -// file(fastaFai) from fai - -// output: -// set val("HaplotypeCaller"), idPatient, idSample, file("${intervalBed.baseName}_${idSample}.vcf") into vcfGenotypeGVCFs - -// when: 'haplotypecaller' in tools - -// script: -// // Using -L is important for speed and we have to index the interval files also -// intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}" -// dbsnpOptions = params.dbsnp ? 
"--D ${dbsnp}" : "" -// """ -// gatk --java-options -Xmx${task.memory.toGiga()}g \ -// IndexFeatureFile \ -// -I ${gvcf} - -// gatk --java-options -Xmx${task.memory.toGiga()}g \ -// GenotypeGVCFs \ -// -R ${fasta} \ -// ${intervalsOptions} \ -// ${dbsnpOptions} \ -// -V ${gvcf} \ -// -O ${intervalBed.baseName}_${idSample}.vcf -// """ -// } - -// vcfGenotypeGVCFs = vcfGenotypeGVCFs.groupTuple(by:[0, 1, 2]) - -// // STEP SENTIEON DNAseq - -// process Sentieon_DNAseq { -// label 'cpus_max' -// label 'memory_max' -// label 'sentieon' - -// tag "${idSample}" - -// input: -// set idPatient, idSample, file(bam), file(bai), file(recal) from bam_sentieon_DNAseq -// file(dbsnp) from dbsnp -// file(dbsnpIndex) from dbsnp_tbi -// file(fasta) from fasta -// file(fastaFai) from fai -// output: -// set val("SentieonDNAseq"), idPatient, idSample, file("DNAseq_${idSample}.vcf") into vcf_sentieon_DNAseq - -// when: 'dnaseq' in tools && params.sentieon - -// script: -// """ -// sentieon driver \ -// -t ${task.cpus} \ -// -r ${fasta} \ -// -i ${bam} \ -// -q ${recal} \ -// --algo Haplotyper \ -// -d ${dbsnp} \ -// DNAseq_${idSample}.vcf -// """ -// } - -// vcf_sentieon_DNAseq = vcf_sentieon_DNAseq.dump(tag:'sentieon DNAseq') - -// // STEP SENTIEON DNAscope - -// process Sentieon_DNAscope { -// label 'cpus_max' -// label 'memory_max' -// label 'sentieon' - -// tag "${idSample}" - -// input: -// set idPatient, idSample, file(bam), file(bai), file(recal) from bam_sentieon_DNAscope -// file(dbsnp) from dbsnp -// file(dbsnpIndex) from dbsnp_tbi -// file(fasta) from fasta -// file(fastaFai) from fai - -// output: -// set val("SentieonDNAscope"), idPatient, idSample, file("DNAscope_${idSample}.vcf") into vcf_sentieon_DNAscope -// set val("SentieonDNAscope"), idPatient, idSample, file("DNAscope_SV_${idSample}.vcf") into vcf_sentieon_DNAscope_SV - -// when: 'dnascope' in tools && params.sentieon - -// script: -// """ -// sentieon driver \ -// -t ${task.cpus} \ -// -r ${fasta} \ -// -i ${bam} \ -// -q ${recal} \ -// --algo DNAscope \ -// -d ${dbsnp} \ -// DNAscope_${idSample}.vcf - -// sentieon driver \ -// -t ${task.cpus} \ -// -r ${fasta}\ -// -i ${bam} \ -// -q ${recal} \ -// --algo DNAscope \ -// --var_type bnd \ -// -d ${dbsnp} \ -// DNAscope_${idSample}.temp.vcf - -// sentieon driver \ -// -t ${task.cpus} \ -// -r ${fasta}\ -// -q ${recal} \ -// --algo SVSolver \ -// -v DNAscope_${idSample}.temp.vcf \ -// DNAscope_SV_${idSample}.vcf -// """ -// } -// vcf_sentieon_DNAscope = vcf_sentieon_DNAscope.dump(tag:'sentieon DNAscope') -// vcf_sentieon_DNAscope_SV = vcf_sentieon_DNAscope_SV.dump(tag:'sentieon DNAscope SV') // // STEP STRELKA.1 - SINGLE MODE diff --git a/modules/nf-core/software/gatk_genotypegvcf.nf b/modules/nf-core/software/gatk_genotypegvcf.nf new file mode 100644 index 0000000000..38a139966c --- /dev/null +++ b/modules/nf-core/software/gatk_genotypegvcf.nf @@ -0,0 +1,32 @@ +process GATK_GENOTYPEVCF { + tag "${meta.id}-${interval.baseName}" + + input: + tuple val(meta), path(interval), path(gvcf) + path dbsnp + path dbsnpIndex + path dict + path fasta + path fai + + output: + tuple val("HaplotypeCaller"), val(meta), path("${interval.baseName}_${meta.id}.vcf") + + script: + // Using -L is important for speed and we have to index the interval files also + intervalsOptions = params.no_intervals ? "" : "-L ${interval}" + dbsnpOptions = params.dbsnp ? 
"--D ${dbsnp}" : "" + """ + gatk --java-options -Xmx${task.memory.toGiga()}g \ + IndexFeatureFile \ + -I ${gvcf} + + gatk --java-options -Xmx${task.memory.toGiga()}g \ + GenotypeGVCFs \ + -R ${fasta} \ + ${intervalsOptions} \ + ${dbsnpOptions} \ + -V ${gvcf} \ + -O ${interval.baseName}_${meta.id}.vcf + """ +} \ No newline at end of file From 39ac708ec01b00a9f2f3bf75b3600896dae68dc0 Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Thu, 6 Aug 2020 14:13:45 +0200 Subject: [PATCH 110/200] Add test profile for germline variant calling --- conf/test_germline_variantcalling.config | 15 +++++++++++++++ nextflow.config | 2 ++ 2 files changed, 17 insertions(+) create mode 100644 conf/test_germline_variantcalling.config diff --git a/conf/test_germline_variantcalling.config b/conf/test_germline_variantcalling.config new file mode 100644 index 0000000000..7cec01feca --- /dev/null +++ b/conf/test_germline_variantcalling.config @@ -0,0 +1,15 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running tests + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a fast and simple test. Use as follows: + * nextflow run nf-core/sarek -profile test_tool + */ + +includeConfig 'test.config' + +params { + // Input data + tools = 'haplotypecaller,strelka' +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 6193471aa4..ce60594cf7 100644 --- a/nextflow.config +++ b/nextflow.config @@ -165,6 +165,8 @@ profiles { test_targeted { includeConfig 'conf/test_targeted.config' } test_tool { includeConfig 'conf/test_tool.config' } test_trimming { includeConfig 'conf/test_trimming.config' } + test_haplotypecaller { includeConfig 'conf/test_germline_variantcalling.config' } + } // Load genomes.config or igenomes.config From 5ca536aa105b123c17af61faf2eb0fd27a422c1b Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Thu, 6 Aug 2020 17:18:40 +0200 Subject: [PATCH 111/200] Start adding strelka with new template --- main.nf | 10 +-- modules/nf-core/software/functions.nf | 59 +++++++++++++++++ .../nf-core/software/gatk_haplotypecaller.nf | 1 - modules/nf-core/software/strelka.nf | 63 +++++++++++++++++++ 4 files changed, 127 insertions(+), 6 deletions(-) create mode 100644 modules/nf-core/software/functions.nf create mode 100644 modules/nf-core/software/strelka.nf diff --git a/main.nf b/main.nf index 49d47b4e6e..45eddbcfbd 100644 --- a/main.nf +++ b/main.nf @@ -593,13 +593,13 @@ workflow { } //TODO: set bam_recalibrated with all these steps // // When using sentieon for mapping, Channel bam_recalibrated is bam_sentieon_recal -// if (params.sentieon && step == 'mapping') bam_recalibrated = bam_sentieon_recal + // if (params.sentieon && step == 'mapping') bam_recalibrated = bam_sentieon_recal -// // When no knownIndels for mapping, Channel bam_recalibrated is bam_duplicates_marked -// if (!params.known_indels && step == 'mapping') bam_recalibrated = bam_duplicates_marked + // // When no knownIndels for mapping, Channel bam_recalibrated is bam_duplicates_marked + // if (!params.known_indels && step == 'mapping') bam_recalibrated = bam_duplicates_marked -// // When starting with variant calling, Channel bam_recalibrated is input_sample -// if (step == 'variantcalling') bam_recalibrated = input_sample + // // When starting with variant calling, Channel bam_recalibrated is input_sample + // if (step == 'variantcalling') bam_recalibrated = input_sample // Creating TSV files to restart from this 
step tsv_recalibrated.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> patient = meta.patient diff --git a/modules/nf-core/software/functions.nf b/modules/nf-core/software/functions.nf new file mode 100644 index 0000000000..c284945c4b --- /dev/null +++ b/modules/nf-core/software/functions.nf @@ -0,0 +1,59 @@ +/* + * ----------------------------------------------------- + * Utility functions used in nf-core DSL2 module files + * ----------------------------------------------------- + */ + +/* + * Extract name of software tool from process name using $task.process + */ +def getSoftwareName(task_process) { + return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() +} + +/* + * Function to initialise default values and to generate a Groovy Map of available options for nf-core modules + */ +def initOptions(Map args) { + def Map options = [:] + options.args = args.args ?: '' + options.args2 = args.args2 ?: '' + options.publish_by_id = args.publish_by_id ?: false + options.publish_dir = args.publish_dir ?: '' + options.publish_files = args.publish_files ?: null + options.suffix = args.suffix ?: '' + return options +} + +/* + * Tidy up and join elements of a list to return a path string + */ +def getPathFromList(path_list) { + def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries + paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes + return paths.join('/') +} + +/* + * Function to save/publish module results + */ +def saveFiles(Map args) { + if (!args.filename.endsWith('.version.txt')) { + def ioptions = initOptions(args.options) + def path_list = [ ioptions.publish_dir ?: args.publish_dir ] + if (ioptions.publish_by_id) { + path_list.add(args.publish_id) + } + if (ioptions.publish_files instanceof Map) { + for (ext in ioptions.publish_files) { + if (args.filename.endsWith(ext.key)) { + def ext_list = path_list.collect() + ext_list.add(ext.value) + return "${getPathFromList(ext_list)}/$args.filename" + } + } + } else { + return "${getPathFromList(path_list)}/$args.filename" + } + } +} \ No newline at end of file diff --git a/modules/nf-core/software/gatk_haplotypecaller.nf b/modules/nf-core/software/gatk_haplotypecaller.nf index 89e11c5397..6971224abb 100644 --- a/modules/nf-core/software/gatk_haplotypecaller.nf +++ b/modules/nf-core/software/gatk_haplotypecaller.nf @@ -16,7 +16,6 @@ process GATK_HAPLOTYPECALLER { tuple val("HaplotypeCallerGVCF"), val(meta), path("${interval.baseName}_${meta.id}.g.vcf"), emit: gvcfHaplotypeCaller tuple val(meta), path(interval), path("${interval.baseName}_${meta.id}.g.vcf"), emit: gvcfGenotypeGVCFs - script: intervalsOptions = params.no_intervals ? "" : "-L ${interval}" diff --git a/modules/nf-core/software/strelka.nf b/modules/nf-core/software/strelka.nf new file mode 100644 index 0000000000..94f4be0520 --- /dev/null +++ b/modules/nf-core/software/strelka.nf @@ -0,0 +1,63 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +process STRELKA { + tag "$meta.id" + + label 'cpus_max' + label 'memory_max' + + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + // TODO nf-core: If a meta map of sample information is NOT provided in "input:" section + // change "publish_id:meta.id" to initialise an empty string e.g. "publish_id:''". 
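+        // A sketch (added for illustration, assuming a hypothetical sample id 'sample1'
+        // and an options map that sets publish_by_id: true; neither is defined in this
+        // commit) of how saveFiles() from functions.nf resolves the saveAs call below,
+        // where publish_dir evaluates to getSoftwareName(task.process) == 'strelka':
+        //     saveFiles(filename: 'Strelka_sample1_variants.vcf.gz', options: options,
+        //               publish_dir: 'strelka', publish_id: 'sample1')
+        //     // => 'strelka/sample1/Strelka_sample1_variants.vcf.gz'
+        // For '*.version.txt' files saveFiles() returns nothing, so they are not published.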
+ saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + + // TODO nf-core: Fetch "docker pull" address for latest Biocontainer image of software: e.g. https://biocontainers.pro/#/tools/samtools + // If required, multi-tool containers may also be available and are usually named to start with "mulled". + container "quay.io/biocontainers/strelka:2.9.10--0" + + // TODO nf-core: List required Conda packages. + // Software MUST be pinned to channel (i.e. "bioconda") and version (i.e. "1.10") as in the example below. + // Pinning the build too e.g. "bioconda::samtools=1.10=h9402c20_2" is not currently a requirement. + conda (params.conda ? "bioconda::strelka=2.9.10" : null) + + input: + // TODO nf-core: Where applicable all sample-specific information e.g. "id", "single_end", "read_group" + // MUST be provided as an input via a Groovy Map called "meta". + // This information may not be required in some instances e.g. indexing reference genome files: + // https://github.com/nf-core/modules/blob/master/software/bwa/index/main.nf + // TODO nf-core: Where applicable please provide/convert compressed files as input/output + // e.g. "*.fastq.gz" and NOT "*.fastq", "*.bam" and NOT "*.sam" etc. + tuple val(meta), path(reads) + // TODO nf-core: List additional required input channels/values here + val options + + output: + // TODO nf-core: Named file extensions MUST be emitted for ALL output channels + // TODO nf-core: If meta is provided in "input:" section then it MUST be added to ALL output channels (except version) + tuple val(meta), path("*.bam"), emit: bam + // TODO nf-core: List additional required output channels/values here + path "*.version.txt", emit: version + + script: + def software = getSoftwareName(task.process) + def ioptions = initOptions(options) + // TODO nf-core: If a meta map of sample information is NOT provided in "input:" section delete the line below + def prefix = ioptions.suffix ? "${meta.id}${ioptions.suffix}" : "${meta.id}" + // TODO nf-core: Where possible, a command MUST be provided to obtain the version number of the software e.g. 1.10 + // If the software is unable to output a version number on the command-line then it can be manually specified + // e.g. https://github.com/nf-core/modules/blob/master/software/homer/annotatepeaks/main.nf + // TODO nf-core: It MUST be possible to pass additional parameters to the tool as a command-line string via the "$ioptions.args" variable + // TODO nf-core: If the tool supports multi-threading then you MUST provide the appropriate parameter + // using the Nextflow "task" variable e.g. "--threads $task.cpus" + // TODO nf-core: Please indent the command appropriately (4 spaces!!) 
to help with readability ;) + """ + software tool \\ + $ioptions.args \\ + --threads $task.cpus \\ + $reads \\ + > ${prefix}.bam + echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' > ${software}.version.txt + """ +} \ No newline at end of file From 8073ad871b567e1f88fc4d06d7dc207999a97698 Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Fri, 7 Aug 2020 14:27:25 +0200 Subject: [PATCH 112/200] Strelka works, except for publishDir --- main.nf | 18 ++++++-- modules/nf-core/software/strelka.nf | 64 ++++++++++++++--------------- 2 files changed, 45 insertions(+), 37 deletions(-) diff --git a/main.nf b/main.nf index 45eddbcfbd..e2c0ff5eca 100644 --- a/main.nf +++ b/main.nf @@ -290,6 +290,7 @@ include { SAMTOOLS_STATS as SAMTOOLS_STATS } from './modules/nf-c include { QUALIMAP_BAMQC as BAMQC } from './modules/nf-core/software/qualimap_bamqc' include { GATK_HAPLOTYPECALLER as HAPLOTYPECALLER } from './modules/nf-core/software/gatk_haplotypecaller' include { GATK_GENOTYPEVCF as GENOTYPEVCF } from './modules/nf-core/software/gatk_genotypegvcf' +include { STRELKA as STRELKA } from './modules/nf-core/software/strelka' include { MULTIQC } from './modules/nf-core/software/multiqc' /* @@ -631,9 +632,10 @@ workflow { ================================================================================ */ //TODO double check whether the indexing has to be repeated here. there is a bai file somewhere up at ApplyBQSR - bam_recalibrated_indexed = SAMTOOLS_INDEX_RECAL(bam_recalibrated, params.modules['samtools_index_mapped'],) - bam_haplotypecaller = bam_recalibrated_indexed.combine(intervals) + bam_recalibrated_indexed_variant_calling = SAMTOOLS_INDEX_RECAL(bam_recalibrated, params.modules['samtools_index_mapped'],) if ('haplotypecaller' in tools){ + bam_haplotypecaller = bam_recalibrated_indexed_variant_calling.combine(intervals) + // STEP GATK HAPLOTYPECALLER.1 HAPLOTYPECALLER(bam_haplotypecaller, dbsnp, @@ -649,10 +651,20 @@ workflow { dict, fasta, fai) - // vcfGenotypeGVCFs = vcfGenotypeGVCFs.groupTuple(by:[0, 1, 2]) + GENOTYPEVCF.out.map{name, meta, vcf -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + [name, patient, sample, gender, status, vcf] + }.groupTuple(by: [0,1,2,]) + .set{ vcfGenotypeGVCFs } } + if ('strelka' in tools) { + STRELKA(bam_recalibrated_indexed_variant_calling, fasta, fai, target_bed, params.modules['strelka']) + } /* ================================================================================ diff --git a/modules/nf-core/software/strelka.nf b/modules/nf-core/software/strelka.nf index 94f4be0520..6b3ee332c8 100644 --- a/modules/nf-core/software/strelka.nf +++ b/modules/nf-core/software/strelka.nf @@ -7,57 +7,53 @@ process STRELKA { label 'cpus_max' label 'memory_max' - publishDir "${params.outdir}", - mode: params.publish_dir_mode, - // TODO nf-core: If a meta map of sample information is NOT provided in "input:" section - // change "publish_id:meta.id" to initialise an empty string e.g. "publish_id:''". - saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + // publishDir "${params.outdir}", + // mode: params.publish_dir_mode, + // saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) + // } - // TODO nf-core: Fetch "docker pull" address for latest Biocontainer image of software: e.g. 
https://biocontainers.pro/#/tools/samtools - // If required, multi-tool containers may also be available and are usually named to start with "mulled". container "quay.io/biocontainers/strelka:2.9.10--0" - - // TODO nf-core: List required Conda packages. - // Software MUST be pinned to channel (i.e. "bioconda") and version (i.e. "1.10") as in the example below. - // Pinning the build too e.g. "bioconda::samtools=1.10=h9402c20_2" is not currently a requirement. - conda (params.conda ? "bioconda::strelka=2.9.10" : null) + //conda (params.conda ? "bioconda::strelka=2.9.10" : null) input: - // TODO nf-core: Where applicable all sample-specific information e.g. "id", "single_end", "read_group" - // MUST be provided as an input via a Groovy Map called "meta". - // This information may not be required in some instances e.g. indexing reference genome files: - // https://github.com/nf-core/modules/blob/master/software/bwa/index/main.nf - // TODO nf-core: Where applicable please provide/convert compressed files as input/output - // e.g. "*.fastq.gz" and NOT "*.fastq", "*.bam" and NOT "*.sam" etc. - tuple val(meta), path(reads) - // TODO nf-core: List additional required input channels/values here + tuple val(meta), path(bam), path (bai) + path(fasta) + path(fai) + path(target_bed) val options output: - // TODO nf-core: Named file extensions MUST be emitted for ALL output channels - // TODO nf-core: If meta is provided in "input:" section then it MUST be added to ALL output channels (except version) - tuple val(meta), path("*.bam"), emit: bam - // TODO nf-core: List additional required output channels/values here + tuple val("Strelka"), val(meta), path("*.vcf.gz"), path("*.vcf.gz.tbi"), emit: vcfStrelkaSingle path "*.version.txt", emit: version script: def software = getSoftwareName(task.process) def ioptions = initOptions(options) - // TODO nf-core: If a meta map of sample information is NOT provided in "input:" section delete the line below def prefix = ioptions.suffix ? "${meta.id}${ioptions.suffix}" : "${meta.id}" - // TODO nf-core: Where possible, a command MUST be provided to obtain the version number of the software e.g. 1.10 - // If the software is unable to output a version number on the command-line then it can be manually specified - // e.g. https://github.com/nf-core/modules/blob/master/software/homer/annotatepeaks/main.nf // TODO nf-core: It MUST be possible to pass additional parameters to the tool as a command-line string via the "$ioptions.args" variable // TODO nf-core: If the tool supports multi-threading then you MUST provide the appropriate parameter // using the Nextflow "task" variable e.g. "--threads $task.cpus" // TODO nf-core: Please indent the command appropriately (4 spaces!!) to help with readability ;) + beforeScript = params.target_bed ? "bgzip --threads ${task.cpus} -c ${target_bed} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" + options = params.target_bed ? 
ioptions.args : "" """ - software tool \\ - $ioptions.args \\ - --threads $task.cpus \\ - $reads \\ - > ${prefix}.bam - echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' > ${software}.version.txt + ${beforeScript} + configureStrelkaGermlineWorkflow.py \ + --bam ${bam} \ + --referenceFasta ${fasta} \ + ${options} \ + --runDir Strelka + python Strelka/runWorkflow.py -m local -j ${task.cpus} + mv Strelka/results/variants/genome.*.vcf.gz \ + Strelka_${meta.sample}_genome.vcf.gz + mv Strelka/results/variants/genome.*.vcf.gz.tbi \ + Strelka_${meta.sample}_genome.vcf.gz.tbi + mv Strelka/results/variants/variants.vcf.gz \ + Strelka_${meta.sample}_variants.vcf.gz + mv Strelka/results/variants/variants.vcf.gz.tbi \ + Strelka_${meta.sample}_variants.vcf.gz.tbi + + echo configureStrelkaGermlineWorkflow.py --version &> ${software}.version.txt #2>&1 || true + #echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' > ${software}.version.txt """ } \ No newline at end of file From d57ede33a2de05b35760ebd96c83a039a45fb750 Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Fri, 14 Aug 2020 11:18:06 +0200 Subject: [PATCH 113/200] Fix publishDir --- modules/nf-core/software/strelka.nf | 38 ++++++++++++++--------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/modules/nf-core/software/strelka.nf b/modules/nf-core/software/strelka.nf index 6b3ee332c8..30b87ef19a 100644 --- a/modules/nf-core/software/strelka.nf +++ b/modules/nf-core/software/strelka.nf @@ -7,10 +7,11 @@ process STRELKA { label 'cpus_max' label 'memory_max' - // publishDir "${params.outdir}", - // mode: params.publish_dir_mode, - // saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) - // } + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + + container "quay.io/biocontainers/strelka:2.9.10--0" //conda (params.conda ? "bioconda::strelka=2.9.10" : null) @@ -29,31 +30,30 @@ process STRELKA { script: def software = getSoftwareName(task.process) def ioptions = initOptions(options) - def prefix = ioptions.suffix ? "${meta.id}${ioptions.suffix}" : "${meta.id}" + def prefix = ioptions.suffix ? "Strelka_${meta.id}" : "Strelka_${meta.id}" // TODO nf-core: It MUST be possible to pass additional parameters to the tool as a command-line string via the "$ioptions.args" variable // TODO nf-core: If the tool supports multi-threading then you MUST provide the appropriate parameter // using the Nextflow "task" variable e.g. "--threads $task.cpus" - // TODO nf-core: Please indent the command appropriately (4 spaces!!) to help with readability ;) beforeScript = params.target_bed ? "bgzip --threads ${task.cpus} -c ${target_bed} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" - options = params.target_bed ? ioptions.args : "" + options_strelka = params.target_bed ? 
ioptions.args : "" """ ${beforeScript} configureStrelkaGermlineWorkflow.py \ --bam ${bam} \ --referenceFasta ${fasta} \ - ${options} \ + ${options_strelka} \ --runDir Strelka + python Strelka/runWorkflow.py -m local -j ${task.cpus} - mv Strelka/results/variants/genome.*.vcf.gz \ - Strelka_${meta.sample}_genome.vcf.gz - mv Strelka/results/variants/genome.*.vcf.gz.tbi \ - Strelka_${meta.sample}_genome.vcf.gz.tbi - mv Strelka/results/variants/variants.vcf.gz \ - Strelka_${meta.sample}_variants.vcf.gz - mv Strelka/results/variants/variants.vcf.gz.tbi \ - Strelka_${meta.sample}_variants.vcf.gz.tbi - - echo configureStrelkaGermlineWorkflow.py --version &> ${software}.version.txt #2>&1 || true - #echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' > ${software}.version.txt + + mv Strelka/results/variants/genome.*.vcf.gz ${prefix}_genome.vcf.gz + + mv Strelka/results/variants/genome.*.vcf.gz.tbi ${prefix}_genome.vcf.gz.tbi + + mv Strelka/results/variants/variants.vcf.gz ${prefix}_variants.vcf.gz + + mv Strelka/results/variants/variants.vcf.gz.tbi ${prefix}_variants.vcf.gz.tbi + + echo configureStrelkaGermlineWorkflow.py --version &> ${software}.version.txt #2>&1 """ } \ No newline at end of file From 3c28aa6f1d2c9b9b063bf145e3721a9bfa4059cd Mon Sep 17 00:00:00 2001 From: FriederikeHanssen Date: Fri, 14 Aug 2020 11:21:20 +0200 Subject: [PATCH 114/200] Add conda option --- modules/nf-core/software/strelka.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/nf-core/software/strelka.nf b/modules/nf-core/software/strelka.nf index 30b87ef19a..e8db199108 100644 --- a/modules/nf-core/software/strelka.nf +++ b/modules/nf-core/software/strelka.nf @@ -14,7 +14,8 @@ process STRELKA { container "quay.io/biocontainers/strelka:2.9.10--0" - //conda (params.conda ? "bioconda::strelka=2.9.10" : null) + + conda (params.conda ? 
"bioconda::strelka=2.9.10" : null) input: tuple val(meta), path(bam), path (bai) From 58d7b85c944f01d4c8de8ce41d5415ddc287b376 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 21 Sep 2020 15:58:13 +0200 Subject: [PATCH 115/200] update docs, SciLifeLab logo, F1000 article --- README.md | 58 +- docs/README.md | 25 +- docs/abstracts/2020-10-VCBS.md | 36 + docs/annotation.md | 90 --- docs/ascat.md | 176 ----- docs/containers.md | 147 ---- docs/images/SciLifeLab_logo.png | Bin 8387 -> 12212 bytes docs/images/SciLifeLab_logo.svg | 178 +++-- docs/input.md | 238 ------ docs/install_bianca.md | 202 ----- docs/output.md | 481 +++++++----- docs/reference.md | 62 -- docs/sentieon.md | 73 -- docs/usage.md | 1213 ++++++++++++++++++++++++------- docs/use_cases.md | 125 ---- docs/variant_calling.md | 57 -- 16 files changed, 1421 insertions(+), 1740 deletions(-) create mode 100644 docs/abstracts/2020-10-VCBS.md delete mode 100644 docs/annotation.md delete mode 100644 docs/ascat.md delete mode 100644 docs/containers.md delete mode 100644 docs/input.md delete mode 100644 docs/install_bianca.md delete mode 100644 docs/reference.md delete mode 100644 docs/sentieon.md delete mode 100644 docs/use_cases.md delete mode 100644 docs/variant_calling.md diff --git a/README.md b/README.md index 29fab9a95d..9d486710a5 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ > **An open-source analysis pipeline to detect germline or somatic variants from whole genome or targeted sequencing** -[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A520.07.0--RC1-brightgreen.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A519.10.0-brightgreen.svg)](https://www.nextflow.io/) [![nf-core](https://img.shields.io/badge/nf--core-pipeline-brightgreen.svg)](https://nf-co.re/) [![DOI](https://zenodo.org/badge/184289291.svg)](https://zenodo.org/badge/latestdoi/184289291) @@ -10,11 +10,9 @@ [![GitHub Actions Linting status](https://github.com/nf-core/sarek/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/sarek/actions?query=workflow%3A%22nf-core+linting%22) [![CircleCi build status](https://img.shields.io/circleci/project/github/nf-core/sarek?logo=circleci)](https://circleci.com/gh/nf-core/sarek/) -[![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg)](http://bioconda.github.io/) -[![Docker Container available](https://img.shields.io/docker/automated/nfcore/sarek.svg)](https://hub.docker.com/r/nfcore/sarek/) -[![Install with Singularity](https://img.shields.io/badge/use%20with-singularity-purple.svg)](https://www.sylabs.io/docs/) - -[![Join us on Slack](https://img.shields.io/badge/slack-nfcore/sarek-blue.svg)](https://nfcore.slack.com/channels/sarek) +[![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg)](https://bioconda.github.io/) +[![Docker](https://img.shields.io/docker/automated/nfcore/sarek.svg)](https://hub.docker.com/r/nfcore/sarek) +[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23sarek-4A154B?logo=slack)](https://nfcore.slack.com/channels/sarek) ## Introduction @@ -33,49 +31,31 @@ It's listed on [Elixir - Tools and Data Services Registry](https://bio.tools/Sar ## Quick Start -i. Install [`Nextflow`](https://nf-co.re/usage/installation) +1. Install [`Nextflow`](https://nf-co.re/usage/installation) -ii. 
Install either [`Docker`](https://docs.docker.com/engine/installation/) or [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) for full pipeline reproducibility (please only use [`Conda`](https://conda.io/miniconda.html) as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))
+2. Install either [`Docker`](https://docs.docker.com/engine/installation/) or [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) for full pipeline reproducibility _(please only use [`Conda`](https://conda.io/miniconda.html) as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_

-iii. Download the pipeline and test it on a minimal dataset with a single command
+3. Download the pipeline and test it on a minimal dataset with a single command:

-```bash
-nextflow run nf-core/sarek -profile test,<docker/singularity/conda/institute>
-```
+   ```bash
+   nextflow run nf-core/sarek -profile test,<docker/singularity/conda/institute>
+   ```

-> Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute.
-> If so, you can simply use `-profile <institute>` in your command.
-> This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment.
+   > Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute.
+   > If so, you can simply use `-profile <institute>` in your command.
+   > This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment.

-iv. Start running your own analysis!
+4. Start running your own analysis!

-```bash
-nextflow run nf-core/sarek -profile <docker/singularity/conda/institute> --input '*.tsv' --genome GRCh38
-```
+   ```bash
+   nextflow run nf-core/sarek -profile <docker/singularity/conda/institute> --input '*.tsv' --genome GRCh38
+   ```

 See [usage docs](docs/usage.md) for all of the available options when running the pipeline.

 ## Documentation

-The nf-core/sarek pipeline comes with documentation about the pipeline, found in the `docs/` directory:
-
-1. [Installation](https://nf-co.re/usage/installation)
-2. Pipeline configuration
-    * [Local installation](https://nf-co.re/usage/local_installation)
-    * [Adding your own system config](https://nf-co.re/usage/adding_own_config)
-    * [Install on a secure cluster](docs/install_bianca.md)
-    * [Reference genomes](https://nf-co.re/usage/reference_genomes)
-    * [Extra documentation on reference](docs/reference.md)
-3. [Running the pipeline](docs/usage.md)
-    * [Examples](docs/use_cases.md)
-    * [Input files documentation](docs/input.md)
-    * [Documentation about containers](docs/containers.md)
-4. [Output and how to interpret the results](docs/output.md)
-    * [Extra documentation on variant calling](docs/variant_calling.md)
-    * [Complementary information about ASCAT](docs/ascat.md)
-    * [Complementary information about Sentieon](docs/sentieon.md)
-    * [Extra documentation on annotation](docs/annotation.md)
-5. [Troubleshooting](https://nf-co.re/usage/troubleshooting)
+The nf-core/sarek pipeline comes with documentation about the pipeline which you can read at [https://nf-co.re/sarek/docs](https://nf-co.re/sarek/docs) or find in the [`docs/` directory](docs).

 ## Credits

@@ -135,7 +115,7 @@ For further information or help, don't hesitate to get in touch on [Slack](https://nfcore.slack.com/channels/sarek)

 ## Citation

 If you use `nf-core/sarek` for your analysis, please cite the `Sarek` article as follows:

-> Garcia M, Juhos S, Larsson M et al.
**Sarek: A portable workflow for whole-genome sequencing analysis of germline and somatic variants [version 1; peer review: 2 approved]** *F1000Research* 2020, 9:63 [doi: 10.12688/f1000research.16665.1](http://dx.doi.org/10.12688/f1000research.16665.1). +> Garcia M, Juhos S, Larsson M et al. **Sarek: A portable workflow for whole-genome sequencing analysis of germline and somatic variants [version 2; peer review: 2 approved]** *F1000Research* 2020, 9:63 [doi: 10.12688/f1000research.16665.2](http://dx.doi.org/10.12688/f1000research.16665.2). You can cite the sarek zenodo record for a specific version using the following [doi: 10.5281/zenodo.3476426](https://zenodo.org/badge/latestdoi/184289291) diff --git a/docs/README.md b/docs/README.md index 43fe182c67..9ae0a5eaf5 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,21 +1,10 @@ # nf-core/sarek: Documentation -The nf-core/sarek documentation is split into the following files: +The nf-core/sarek documentation is split into the following pages: -1. [Installation](https://nf-co.re/usage/installation) -2. Pipeline configuration - * [Local installation](https://nf-co.re/usage/local_installation) - * [Adding your own system config](https://nf-co.re/usage/adding_own_config) - * [Install on a secure cluster](install_bianca.md) - * [Reference genomes](https://nf-co.re/usage/reference_genomes) - * [Extra documentation on reference](reference.md) -3. [Running the pipeline](usage.md) - * [Examples](use_cases.md) - * [Input files documentation](input.md) - * [Documentation about containers](containers.md) -4. [Output and how to interpret the results](output.md) - * [Extra documentation on variant calling](variant_calling.md) - * [Complementary information about ASCAT](ascat.md) - * [Complementary information about Sentieon](sentieon.md) - * [Extra documentation on annotation](annotation.md) -5. [Troubleshooting](https://nf-co.re/usage/troubleshooting) +- [Usage](usage.md) + - An overview of how the pipeline works, how to run it and a description of all of the different command-line flags. +- [Output](output.md) + - An overview of the different results produced by the pipeline and how to interpret them. + +You can find a lot more documentation about installing, configuring and running nf-core pipelines on the website: [https://nf-co.re](https://nf-co.re) diff --git a/docs/abstracts/2020-10-VCBS.md b/docs/abstracts/2020-10-VCBS.md new file mode 100644 index 0000000000..ca4b00314c --- /dev/null +++ b/docs/abstracts/2020-10-VCBS.md @@ -0,0 +1,36 @@ +# Victorian Cancer Bioinformatics Symposium - online, 2020-10-23 + +## Sarek, a reproducible and portable workflow for analysis of matching tumor-normal NGS data + +Maxime Garcia [1], Szilveszter Juhos [1], Teresita Díaz de Ståhl [1], Markus Mayrhofer [2], Johanna Sandgren [1], Björn Nystedt [2], Monica Nistér [1] + +[1] Dept. of Oncology Pathology, The Swedish Childhood Tumor Biobank (Barntumörbanken, BTB); Karolinska Institutet +[2] Dept. of Cell and Molecular Biology; National Bioinformatics Infrastructure Sweden, Science for Life Laboratory; Uppsala University + +### Introduction + +High throughput sequencing for precision medicine is a routine method. +Numerous tools have to be used, and analysis is time consuming. +We propose Sarek, an open-source container based bioinformatics workflow for germline or tumor/normal pairs (can include matched relapses), written in Nextflow, to process WGS, whole-exome or gene-panel samples. 
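For a concrete picture of the germline analysis described above, the test profile added in PATCH 110 of this series configures exactly such a run; a minimal sketch in Nextflow config (Groovy), reusing the contents of `conf/test_germline_variantcalling.config`:

```groovy
// Germline variant calling test profile (per conf/test_germline_variantcalling.config)
includeConfig 'test.config'

params {
    // Germline callers exercised by the test run
    tools = 'haplotypecaller,strelka'
}
```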
+
+### Methods
+
+Sarek is part of nf-core, a collection of high quality peer-reviewed workflows; supported environments are Docker, Singularity and Conda, enabling version tracking and reproducibility.
+It is designed with flexible environments in mind: local fat node, HTC cluster or cloud environment like AWS.
+Several model organism references are available (including Human GRCh37 and GRCh38).
+Sarek is based on GATK best practices to prepare short-read data.
+The pipeline then reports germline and somatic SNVs and SVs (HaplotypeCaller, Strelka, Mutect2, Manta and TIDDIT).
+CNVs, purity and ploidy are estimated with ASCAT and Control-FREEC.
+At the end of the analysis the resulting VCF files can be annotated by SNPEff and/or VEP to facilitate further downstream processing.
+Furthermore, a broad set of QC metrics is reported as a final step of the workflow with MultiQC.
+Additional software can be included as new modules.
+
+### Results
+
+From FASTQs to annotated VCFs it takes four days for a paired 90X/90X WGS-sample on a 48-core node, with the complete set of tools.
+Processing can be sped up with the optional use of Sentieon (C).
+Sarek is used in production at the National Genomics Infrastructure Sweden for germline and cancer samples for the Swedish Childhood Tumor Biobank and other research groups.
+
+### Conclusion
+
+Sarek is an easy-to-use tool for germline or cancer NGS samples, to be downloaded from [nf-co.re/sarek](https://nf-co.re/sarek) under MIT license.
diff --git a/docs/annotation.md b/docs/annotation.md
deleted file mode 100644
index ba94d85671..0000000000
--- a/docs/annotation.md
+++ /dev/null
@@ -1,90 +0,0 @@
-# Annotation
-
-## Tools
-
-With Sarek, annotation is done using `snpEff`, `VEP`, or even both consecutively:
-
-- `--tools snpEff`
-  - To annotate using `snpEff`
-- `--tools VEP`
-  - To annotate using `VEP`
-- `--tools snpEff,VEP`
-  - To annotate using `snpEff` and `VEP`
-- `--tools merge`
-  - To annotate using `snpEff` followed by `VEP`
-
-VCF produced by Sarek will be annotated if `snpEff` or `VEP` are specified with the `--tools` command.
-As Sarek will use `bgzip` and `tabix` to compress and index VCF files annotated, it expects VCF files to be sorted.
-
-In these examples, all command lines will be launched starting with `--step annotate`.
-It can of course be started directly from any other step instead.
-
-## Using genome specific containers
-
-Sarek has already designed containers with `snpEff` and `VEP` files for Human (`GRCh37`, `GRCh38`), Mouse (`GRCm38`), Dog (`CanFam3.1`) and Roundworm (`WBcel235`).
-Default settings will run using these containers.
-
-The main Sarek container has also `snpEff` and `VEP` installed, but without the cache files that can be downloaded separately.
-
-## Download cache
-
-A Nextflow helper script has been designed to help downloading `snpEff` and `VEP` caches.
-Such files are meant to be shared between multiple users, so this script is mainly meant for people administrating servers, clusters and advanced users.
-
-```bash
-nextflow run download_cache.nf --snpeff_cache --snpeff_db --genome
-nextflow run download_cache.nf --vep_cache --species --vep_cache_version --genome
-```
-
-## Using downloaded cache
-
-Both `snpEff` and `VEP` enable usage of cache.
-If cache is available on the machine where Sarek is run, it is possible to run annotation using cache.
-You need to specify the cache directory using `--snpeff_cache` and `--vep_cache` in the command lines or within configuration files.
-The cache will only be used when `--annotation_cache` and cache directories are specified (either in command lines or in a configuration file). - -Example: - -```bash -nextflow run nf-core/sarek --tools snpEff --step annotate --sample --snpeff_cache --annotation_cache -nextflow run nf-core/sarek --tools VEP --step annotate --sample --vep_cache --annotation_cache -``` - -## Using VEP CADD plugin - -To enable the use of the VEP CADD plugin: - -- Download the CADD files -- Specify them (either on the command line, like in the example or in a configuration file) -- use the `--cadd_cache` flag - -Example: - -```bash -nextflow run nf-core/sarek --step annotate --tools VEP --sample --cadd_cache \ - --cadd_indels \ - --cadd_indels_tbi \ - --cadd_wg_snvs \ - --cadd_wg_snvs_tbi -``` - -### Downloading CADD files - -An helper script has been designed to help downloading CADD files. -Such files are meant to be share between multiple users, so this script is mainly meant for people administrating servers, clusters and advanced users. - -```bash -nextflow run download_cache.nf --cadd_cache --cadd_version --genome -``` - -## Using VEP GeneSplicer plugin - -To enable the use of the VEP GeneSplicer plugin: - -- use the `--genesplicer` flag - -Example: - -```bash -nextflow run nf-core/sarek --step annotate --tools VEP --sample --genesplicer -``` diff --git a/docs/ascat.md b/docs/ascat.md deleted file mode 100644 index 718363e00a..0000000000 --- a/docs/ascat.md +++ /dev/null @@ -1,176 +0,0 @@ -# ASCAT - -## Introduction - -ASCAT is a software for performing allele-specific copy number analysis of tumor samples and for estimating tumor ploidy and purity (normal contamination). -ASCAT is written in R and available here: [github.com/Crick-CancerGenomics/ascat](https://github.com/Crick-CancerGenomics/ascat). - -To run ASCAT on NGS data we need BAM files for the tumor and normal samples, as well as a loci file with SNP positions. -If ASCAT is run on SNP array data, the loci file contains the SNPs on the chip. -When runnig ASCAT on NGS data we can use the same loci file, for example the one corresponding to the AffymetrixGenome-Wide Human SNP Array 6.0, but we can also choose a loci file of our choice with i.e. SNPs detected in the 1000 Genomes project. - -### BAF and LogR values - -Running ASCAT on NGS data requires that the BAM files are converted into BAF and LogR values. -This can be done using the software [AlleleCount](https://github.com/cancerit/alleleCount) followed by a simple R script. -AlleleCount extracts the number of reads in a BAM file supporting each allele at specified SNP positions. -Based on this, BAF and logR can be calculated for every SNP position i as: - -```R -BAFi(tumor)=countsBi(tumor)/(countsAi(tumor)+countsBi(tumor)) -BAFi(normal)=countsBi(normal)/(countsAi(normal)+countsBi(normal)) -LogRi(tumor)=log2((countsAi(tumor)+countsBi(tumor))/(countsAi(normal)+countsBi(normal)) - median(log2((countsA(tumor)+countsB(tumor))/(countsA(normal)+countsB(normal))) -LogRi(normal)=0 -``` - -For male samples, the X and Y chromosome markers have special treatment: - -```R -LogRi(tumor)=log2((countsAi(tumor)+countsBi(tumor))/(countsAi(normal)+countsBi(normal))-1 - median(log2((countsA(tumor)+countsB(tumor))/(countsA(normal)+countsB(normal))-1) -``` - -where: -*i* corresponds to the postions of all SNPs in the loci file. 
-*CountsA* and *CountsB* are vectors containing number of reads supporting the *A* and *B* alleles of all SNPs -*A* = the major allele -*B* = the minor allele -*Minor* and *major* alleles are defined in the loci file (it actually doesn't matter which one is defied as A and B in this application) - -Calculation of LogR and BAF based on AlleleCount output is done as in [runASCAT.R](https://github.com/cancerit/ascatNgs/tree/dev/perl/share/ascat/runASCAT.R) in the ascatNgs repository on Github. - -### Loci file - -The loci file was created based on the 1000Genomes latest release (phase 3, releasedate 20130502), available [here](ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp//release/20130502/ALL.wgs.phase3_shapeit2_mvncall_integrated_v5b.20130502.sites.vcf.gz). -The following filter was applied: Only bi-allelc SNPs with minor allele frequencies > 0.3. - -The loci file was originally generated for GRCh37. -It was translated into GRCh38 using the tool liftOver available at the UCSC Genome Browser. -To run liftOver the loci file was first written in bed format: - -```bash -awk '{print "chr"$1":"$2"-"$2}' 1000G_phase3_20130502_SNP_maf0.3.loci > 1000G_phase3_20130502_SNP_maf0.3.bed -``` - -Using the web interface to liftOver at [genome.ucsc.edu](https://genome.ucsc.edu/cgi-bin/hgLiftOver) the file was translated into GRCh38 coordinates. -LiftOver was possible for 3261270 out of 3268043 SNPs. -The converted SNP positions were printed in the format required by AlleleCounter by: - -```bash -more hglft_genome_5834_13aba0.bed | awk 'BEGIN{FS="chr"} {print $2}' | awk 'BEGIN{FS="-"} {print $1}' | awk 'BEGIN{FS=":";OFS="\t"} {print $1,$2}' > 1000G_phase3_GRCh38_maf0.3.loci -``` - -### GC correction file - -Input files for ASCAT's GC correction were created for the above loci files, using the scripts and instructions for this on [ASCAT's github repository](https://github.com/Crick-CancerGenomics/ascat/tree/master/gcProcessing) - -#### scripts and data for generating the GC correction file - -The following scripts were downloaded from : - -- *createGCcontentFile.R* -- *createWindowBed.pl* -- *GCfileCreation.sh*. - -To generate the GC correction file additional files are needed: - -- *locifile* described above -- *reference.fasta* is the genome reference file in fasta format - -The files are descibed in [Genomes and reference files documentation](reference.md) - -- *chromosomesizes.txt* is a tab delimited text file containing the size of all chromosomes included in the loci file. - -An example file is available in - -#### Modification of createWindowBed.pl for our GRCh37 loci file - -The genomc reference file we use in Sarek for GRCh37 is coded without "chr" in the chromosome names, while the genome reference file we use in Sarek for GRCh38 includes "chr" in the chromosome names. -The script [createWindowBed.pl](https://github.com/Crick-CancerGenomics/ascat/tree/master/gcProcessing/createWindowBed.pl) assumes that `chr` is included in the chromosome names of the reference file, so a small modification of this script was done for the process to work on our GRCh37 loci file. 
- -These two lines in createWindowBed.pl generate output (lines 61 and 64): - -```perl -(61) print OUT "chr".$tab[1]."\t".$start."\t".$stop."\t".$tab[0]."\t".$tab[2]."\t".($w*2+1)."\n"; -(64) print OUT "chr".$tab[1]."\t".$start."\t".$stop."\t".$tab[0]."\t".$tab[2]."\t".($w*2)."\n"; -``` - -and were changed to: - -```perl -(61) print OUT $tab[1]."\t".$start."\t".$stop."\t".$tab[0]."\t".$tab[2]."\t".($w*2+1)."\n"; -(64) print OUT $tab[1]."\t".$start."\t".$stop."\t".$tab[0]."\t".$tab[2]."\t".($w*2)."\n"; -``` - -After this modification the script works for our GRCh37 loci file. - -#### Process - -The following sbatch script was run on the Uppmax cluster Rackham, to generate the CG correction files: - -```bash -#!/bin/bash -l -#SBATCH -A projid -#SBATCH -p node -#SBATCH -t 24:00:00 -#SBATCH -J createGCfile -module load bioinfo-tools -module load BEDTools -module load R/3.5.0 -module load R_packages/3.5.0 -./GCfileCreation.sh 1000G_phase3_20130502_SNP_maf0.3.loci chrom.sizes 19 human_g1k_v37_decoy.fasta -``` - -where: - -- *1000G_phase3_20130502_SNP_maf0.3.loci* is the loci file for GRCh37 described above -- *human_g1k_v37_decoy.fasta* is the genome reference file used for GRCh37 -- *chrom.sizes* is the list of the chromosome lengths in GRCh37 - -Names of the chromosomes in chrom.sizes file must be the same as in the genome reference, so in case of GRCh37 we used "1", "2" etc and in GRCh38 we used "chr1", "chr2" etc. - -- *19* means that 19 cores are available for the script. - -This created GC correction files with the following column headers: - -| | | | | | | | | | | | | | | | | | | | | -|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-| -|Chr|Position|25|50|100|200|500|1000|2000|5000|10000|20000|50000|100000|200000|500000|1M|2M|5M|10M| - -This file gave an error when running ASCAT, and the error message suggested that it had to do with the column headers. -The Readme.txt in suggested that the column headers should be: - -| | | | | | | | | | | | | | | | | | | | | -|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-| -|Chr|Position|25bp|50bp|100bp|200bp|500bp|1000bp|2000bp|5000bp|10000bp|20000bp|50000bp|100000bp|200000bp|500000bp|1M|2M|5M|10M| - -The column headers headers of the generated GC correction files were therefore manually edited. - -#### Format of GC correction file - -The final files are tab-delimited with the following columns (and some example data): - -|Chr|Position|25bp|50bp|100bp|200bp|500bp|1000bp|2000bp|5000bp|10000bp|20000bp|50000bp|100000bp|200000bp|500000bp|1M|2M|5M|10M| -|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-| -|snp1|1|14930|0.541667|0.58|0.61|0.585|0.614|0.62|0.6|0.5888|0.588|0.4277|0.395041|0.380702|0.383259|0.341592|0.339747|0.386343|0.500537|0.511514 -|snp2|1|15211|0.625|0.64|0.67|0.63|0.61|0.612|0.6135|0.591|0.5922|0.4358|0.39616|0.380411|0.383167|0.34163|0.339771|0.386417|0.500558|0.511511 -|snp3|1|15820|0.541667|0.56|0.62|0.655|0.65|0.612|0.5885|0.5936|0.5797|0.4511|0.397771|0.379945|0.382999|0.341791|0.339832|0.386554|0.500579|0.511504 - -### Output - -The ASCAT process gives several images as output, described in detail in this [book chapter](http://www.ncbi.nlm.nih.gov/pubmed/22130873). -The script also gives out a text file (*tumor.cnvs.txt*) with information about copy number state for all the segments predicted by ASCAT. 
-The output is a tab delimited text file with the following columns: - -```text -chr startpos endpos nMajor nMinor -``` - -Where: - -- *chr* is the chromosome number -- *startpos* is the start position of the segment -- *endpos* is the end position of the segment -- *nMajor* is number of copies of one of the allels (for example the chromosome inherited from the father) -- *nMinor* is the number of copies of the other allele (for example the chromosome inherited of the mother) - -The file *tumor.cnvs.txt* contains all segments predicted by ASCAT, both those with normal copy number (nMinor = 1 and nMajor =1) and those corresponding to copy number aberrations. diff --git a/docs/containers.md b/docs/containers.md deleted file mode 100644 index a926d1f709..0000000000 --- a/docs/containers.md +++ /dev/null @@ -1,147 +0,0 @@ -# Containers - -Our main container is designed using [Conda](https://conda.io/) to install all tools used in Sarek: - -- [sarek](#sarek-) - -For annotation, the main container can be used, but the cache has to be downloaded, or additional containers are available with cache (see [extra annotation documentation](annotation.md)): - -- [sareksnpeff](#sareksnpeff-) -- [sarekvep](#sarekvep-) - -## What is actually inside the containers - -### sarek [![sarek-docker status](https://img.shields.io/docker/automated/nfcore/sarek.svg)](https://hub.docker.com/r/nfcore/sarek) - -- Based on `nfcore/base:1.9` -- Contain **[ASCAT](https://github.com/Crick-CancerGenomics/ascat)** 2.5.2 -- Contain **[AlleleCount](https://github.com/cancerit/alleleCount)** 4.0.2 -- Contain **[BCFTools](https://github.com/samtools/bcftools)** 1.9 -- Contain **[bwa-mem2](https://github.com/bwa-mem2/bwa-mem2)** 2.0 -- Contain **[Control-FREEC](https://github.com/BoevaLab/FREEC)** 11.5 -- Contain **[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/)** 0.11.9 -- Contain **[FreeBayes](https://github.com/ekg/freebayes)** 1.3.2 -- Contain **[GATK4-spark](https://github.com/broadinstitute/gatk)** 4.1.6.0 -- Contain **[GeneSplicer](https://ccb.jhu.edu/software/genesplicer/)** 1.0 -- Contain **[ggplot2](https://github.com/tidyverse/ggplot2)** 3.3.0 -- Contain **[HTSlib](https://github.com/samtools/htslib)** 1.9 -- Contain **[Manta](https://github.com/Illumina/manta)** 1.6.0 -- Contain **[msisensor](https://github.com/ding-lab/msisensor)** 0.5 -- Contain **[MultiQC](https://github.com/ewels/MultiQC/)** 1.8 -- Contain **[Qualimap](http://qualimap.bioinfo.cipf.es)** 2.2.2d -- Contain **[samtools](https://github.com/samtools/samtools)** 1.9 -- Contain **[snpEff](http://snpeff.sourceforge.net/)** 4.3.1t -- Contain **[Strelka2](https://github.com/Illumina/strelka)** 2.9.10 -- Contain **[TIDDIT](https://github.com/SciLifeLab/TIDDIT)** 2.7.1 -- Contain **[pigz](https://zlib.net/pigz/)** 2.3.4 -- Contain **[Trim Galore](https://github.com/FelixKrueger/TrimGalore)** 0.6.5 -- Contain **[VCFanno](https://github.com/brentp/vcfanno)** 0.3.2 -- Contain **[VCFtools](https://vcftools.github.io/index.html)** 0.1.16 -- Contain **[VEP](https://github.com/Ensembl/ensembl-vep)** 99.2 - -### sareksnpeff [![sareksnpeff-docker status](https://img.shields.io/docker/automated/nfcore/sareksnpeff.svg)](https://hub.docker.com/r/nfcore/sareksnpeff) - -- Based on `nfcore/base:1.9` -- Contain **[snpEff](http://snpeff.sourceforge.net/)** 4.3.1t -- Contains cache for `GRCh37`, `GRCh38`, `GRCm38`, `CanFam3.1` or `WBcel235` - -### sarekvep [![sarekvep-docker 
status](https://img.shields.io/docker/automated/nfcore/sarekvep.svg)](https://hub.docker.com/r/nfcore/sarekvep) - -- Based on `nfcore/base:1.9` -- Contain **[GeneSplicer](https://ccb.jhu.edu/software/genesplicer/)** 1.0 -- Contain **[VEP](https://github.com/Ensembl/ensembl-vep)** 99.2 -- Contain cache for `GRCh37`, `GRCh38`, `GRCm38`, `CanFam3.1` or `WBcel235` - -## Building your own - -Our containers are designed using [Conda](https://conda.io/). -The [`environment.yml`](../environment.yml) file can be modified if particular versions of tools are more suited to your needs. - -The following commands can be used to build/download containers on your own system: - -- Adjust `VERSION` for sarek version (typically a release or `dev`). - -### Build with Conda - -```Bash -conda env create -f environment.yml -``` - -### Build with Docker - -- `sarek` - -```Bash -docker build -t nfcore/sarek: . -``` - -- `sareksnpeff` - -Adjust arguments for `GENOME` version and snpEff `CACHE_VERSION` - -```Bash -docker build -t nfcore/sareksnpeff:. containers/snpeff/. --build-arg GENOME= --build-arg CACHE_VERSION= -``` - -- `sarekvep` - -Adjust arguments for `GENOME` version, `SPECIES` name and VEP `VEP_VERSION` - -```Bash -docker build -t nfcore/sarekvep:. containers/vep/. --build-arg GENOME= --build-arg SPECIES= --build-arg VEP_VERSION= -``` - -### Pull with Docker - -- `sarek` - -```Bash -docker pull nfcore/sarek: -``` - -- `sareksnpeff` - -Adjust arguments for `GENOME` version - -```Bash -docker pull nfcore/sareksnpeff:. -``` - -- `sarekvep` - -Adjust arguments for `GENOME` version - -```Bash -docker pull nfcore/sarekvep:. -``` - -### Pull with Singularity - -You can directly pull singularity image, in the path used by the Nextflow ENV variable `NXF_SINGULARITY_CACHEDIR`, ie: - -```Bash -cd $NXF_SINGULARITY_CACHEDIR -singularity build ... -``` - -- `sarek` - -```Bash -singularity build nfcore-sarek-.img docker://nfcore/sarek: -``` - -- `sareksnpeff` - -Adjust arguments for `GENOME` version - -```Bash -singularity build nfcore-sareksnpeff-..img docker://nfcore/sareksnpeff:. -``` - -- `sarekvep` - -Adjust arguments for `GENOME` version - -```Bash -singularity build nfcore-sarekvep-..img docker://nfcore/sarekvep:. -``` diff --git a/docs/images/SciLifeLab_logo.png b/docs/images/SciLifeLab_logo.png index bc4dbda623f8f70d82f883c5019638d86016a298..e71d44b9319ed8f02e860517807e1269eaec9646 100644 GIT binary patch literal 12212 zcmYjX1yCDpwB6wD5?qSAySr0pX`#4Nptx(0;_eQ`N{c(iDHL~i*WyxKUija_sb=(!yd0`zk%d7=dR1?McK_#O%R@ecLFhfQ2Ow*%@fVLBjfdIqhbQp+ zlQ^Z?cF1D>7F(MHABBrw{Ss9ffxrJ{qek3$ES!V55yn-zD3hR;DQ3JeRNXcVr z2Fle<>-Dd8leKnsEQcgzExTl_P(yyZt`@eL@BbNzC~-q<#(}-}oBKnF2|OV6cP!Vt zeDWted=E8{L|$M4^VFvNom466P_*PePt43yqw=*@rdu#sQ2Y(8e)1Hb_sw;T) zzdrv1OxuTWxiHVSwEuh4Ck@pc44bp?LxGF(tAPuFt60a4e+}U-sQs*-SU*yw7qUc z|1T0MXe1y&#vRs(Ju?o5!ds`~KcgAYTHz)pY=v3ZU|$-4Gzm)m$NXx#{M@XuNmD~e zH1X+~4nQ}U&RuPCO(gtBu(j&Xi%z3TsvevOkVYJh2>}v_og`^iw2G~8xz~F{rUu&Y zR#usyHbk@MNCxnZSIysv(}!h(rhnSu zf7oZqpcOsJfx0!RJfoL~2w4~^)>94kWc%1++S??Hub$>SH~IPLkmrz`!ylctUD9}5 zmw3RYlBnfYE4Agdb6h#`a?7pF)Nc{4ApSnpui)Eq@{KupxS7`QWw^xTFbv@eQAF}d zGxKsBy%<=)N27={Me@+AU8_W3b zb~)#EGr_MdeLZcP*Wvj4a)}51JIKMN*ym}Pmmb~W2b=C?iW+K7&s4u!jWa%XMNW&! 
zp!-V2$JfD;ZOu0h+c{%OK%0Im)l}CmHns4~$p5$czle(ZheI26PgjM$oBs_bpRIh2 z1c#SwAGE$$9hv)Pot*#gg?XKNi$-j=(BLw{x>0NM26SYO=_{fiy&e9@g!NOa|z)N+n) zoFFVZ7+pWC0{F2n&5z88dmm2w;oGXex@~OD#GR9E)5ogEXL*;D3wtYfP=7X2;cCV2 zJeM^cD(vR7P#~Kf+K*1}d)m zGJa;gV<3v{D*)ba=CPn0df{S|{kmMMlvll!6&SV*hYPgi+~B^8;ocTLH=CS#KmEtr z$_lJ^dZ}vJ%LBklSW&G9WQ$kQ2X^JD4JRTA;6xz+L13`Vf{XzL>}{GPrwL0Uv4IK= zPHF4kNRr>p&%VIk9&6{75{tQCy_4wQJ1MU-dN?uBHAe*XSo*|E=7sM*5g1oDZNjb! zbU|BT+Z#q1NR+~AD)Sh{J1UR%>0N6U|w%cbj8Z z$MO*&a27MF$k1lElNLgbvrj@oawybbmyME-8#Xar?Yab>tg)8>1K0pon!ANl{t-wL zO*^Wp!XAF2L)!!-IT~A}sr>V1>V0g*VeYl02*#PlGHqF?>l1$DzU>sxOX8^>-JL9i zZO}a{EGMMTTMh5cG#0U9hF)h>C;gfk50`_RHs&g^GM$}!X}JOo7eu(c5^~F=K;HIn zzA{&bQ$z8-D-G2sMEB_oZsIKmwI?Pj4bwvQDg-)pG9=vyXp9VhV-gr&TbgtUi!*6X z-nc|AK|H;#wglEq!$7Li7AF^sV;mF9M5csnZl$&SQFf~w-gnb2$d5sqhei!-aO+-q zh?jf%m9<#G8M(d`$Z*6&n;9|XOQY~slugs-cy*ZIKCBI}gb#ZeNhxrzl~Hb3toh^T zsp(r}Tza)D`^B@3H9*X4tiYwNH*F1$bVBbdoCZ18__nD;D&>nI+}W3Sqq7k!>T@5# zXdY1e2T>2!@7#@$2t%A99Ey~w_I!^K$EfGI@Z$Rg=Hg|lLJ!o=k087Rx!JzX10w?- z_diK}Ck8Wh{tKcjiYRMAKtr+~Xo}XIHG6ig{MlvWpUT5BxOZ$-$gbmO-Ek*JRM79{ zb3(dm(%>Nwn#-^731IDj-Z;edsi7B^*k%`Fk(r`$&4U~y82Ct!@dXubtW-Nx1aJ1= znDAqn(SW(AA|gzHWVzSbDX{hx=h6Vl1b;s{!-8agua1;(zDBlHHY)S2iG`p=*6}Ic z&|XEF$hSiB6AOziXA;n3Q4|xx1EX&`iSI)6hGaJU!YX3-&lxv1RS0_-qWvsb8I%P& zy?GA?#z)AW8rn&a;ne>%mH#w3d(S_31vkvo4$*{98eqi|Wrh8cYH<%~zUQc@iv z*+0FzN!o~WidSaT!!#@2{l)qY!~GkZ#9JNDv+#Wa{xvQD2oNj-t=YTG1n(^r-kI65^$vmmelD^6bO4&@S8~ zPGOsBviw~BXp!P36WMszFP>OWS7|)xwvWmhMKNDY#{veRaSM(HhzEBV0uD!9Is*4C zB@7Cg9TiLk>S2ha-`d@Tm5Y3}xV) zO4!!8fMkFV1TmU0L?Nf9aUp0eP7-5KD%wW7BJ*wV#axXNJk5>YBm@S%+U{qJocpqN z-mL;0zoM+!THFtk3=Q~=Czf`ub~;M7=_8$HH130}>)RrY!T&VfM%g*~b!D`BIt$En zDrIzaO)+-rjW`58D3HRgC&1+X9*sM=It8*K4eDsrYbP;vGEQ|RPQJS#G~Ai*enW8| z!VQ-((URdO-#uE+(-XS}i8Fn(?uw!ijHN%Og zuxFaQKsU26gXrTKbZ7F7ll@#Vf8|L2=kz79XaCpriNR%!u&T`#S1HV?^yI^hn)+#SQ zWLjabG(-A`!!YO6QE%iq_|vRr-`v=^J8hR|05k7y`-Y-CuyvB;2z#IRhdyE~?g7$o z*PYp};AHbTkPM4*Q-Zu9@D+|_4m&lMxJmB~`W6z5;l^8*kziC4#Y>ebo)M?T$j06m zBJL47>S4HfxBg(FwQz86yqmdKh6mRjp@|@NR4AwwbzL9EPnCtfSLJem$$d<#__Lc# zVmKIC^b}W|`ye`y{chat54D)SWDLuA{{!9&SHg%Z2R=i#6-_ z(GvDl*>^Vhi%&8oR6IDrxJQFnXAaXVlrQ0FpT%I}rUZ`Tjk~Lk3u@-Z*Et5#SPTV+ z-0_QcT~M!v1?n`>2?cU8bFmKf|Dpg_@Ehk3tJy+@S4=-~_H(oS_J{1A$s^>XghupI zj;2J&=ug+GpF)A7cWlZ6=&TIh0np_Ge67qrCpRaz#56-8`y<#cAg8l}rk)Yb@j03o zg^H&!wN_)i<~BA~)Xh!tuyU#Xnt|!jI<1gJ5WmXCm(r=(*!vAZWWCoysQGC}2YQ}h z!&fBw7z&~3$fz(S^~nWKh;;bRN+S)t#$&MqI|uls)yAyEDWY%4PM$M(rhmN={XI}2 zI)8G!rX(xv(rax8z4Ic>e|W!cXZe|?KNzTp44zzvK#G$0L-9O5DJ*|}S2%X<_Y?#3 z_Je^-k>wkzYxAWgM>f$mH|!Op&~9|8!93cX=fJNB0E-8z?q;{puQ%;Z$Dfjezaxbh zqtWDlBfg}xAeUjej;CSe@K@=SH=AiQjI_5VhOXj^OH1fAWg9)49g535K4?RqBAc5_ zND`&oQLoIj&c4-^4V(X#|CHV`wdXvPWuCZ}#p!px79koR_K~{;CqS$_kA!(VhC4jK z?LdX5pln;=s}giu0$p4v0K{G9Zk(+Rsk~S?m}t!~Q^idOEcqH??Q)kc0CXFpve8@d1{RLZ{E^_o8hgVOS@n==xP;eit01AOqOW@Lz9 zymA|dWY0Y%-Zg9Zv)+XZ2Pw*Uk`3@s>YM!%ioGsBHwKDpQ;Gv7iHK%{USPJV8G8!C zW%3IVzn=u;c!uuj47`+ig7=eW~J8TU_lCMctWJfTfP1~7O4g!yiX4~d~VNWN+J&oZ~h5810ZZ9lADv@_S zEPx6?D_N{lyfKWtYDsApnVSG6=~6D;UH`x?mnV%OO$E^hR!%UbMq;&$6sz`gb7yj9Dm_Lex93(mEktsg}+)&y4 zb-!}{f_ZW=5eksp#nzH|)8ftIj;q(I&wG0P)^iyRc*{^nq2q(fHOn7IPoj?<0^<_d zdut(J^ccXh##(RQDBO4$nvU?@rs?j(+3(s;HbwixN@t9TNe@%&>5^5gRTE!8(&nM^}z;KGoPeJOHPHaGBTp2KfAihkWM z?lkb4s#m=mw6zhAOMl;HBmX1#^#oclC!8zNHY}gdy`NlFp?+Q!{ZrWL_Kb>n3I@jT ztJ0C2VpA)x(IW=NQX^s19mjw++D*_STdoBTdkAbmK^BVJLx2SzpFF5_UE_Q6dXIc` zE_*2%jNX8TeQ4L-BNS}UYI!&|UHNPLRCI2m-euelufcz?q*UEuWD zSpS6=ye6Z;Pv45a&Id3eL}dO_y&aT46!&Qz*qWa5Mx|SfIbw7E)!rbGm0d09j?208 z=)W{SuYEHS60Ht7OP0D<=#&G-ET+Jh`oIfJ@b~zU=a2n7TYhL>LOj%03sQ@RpQ+>f 
diff --git a/docs/images/SciLifeLab_logo.svg b/docs/images/SciLifeLab_logo.svg
index b8a44b794e..3602a3b855 100644
--- a/docs/images/SciLifeLab_logo.svg
+++ b/docs/images/SciLifeLab_logo.svg
@@ -12,18 +12,23 @@
   version="1.1"
   id="svg2"
   xml:space="preserve"
-  width="309.41211"
-  height="100"
-  viewBox="0 0 309.41211 100"
-  sodipodi:docname="SciLifeLab.svg"
-  inkscape:export-filename="SciLifeLab.png"
-  inkscape:export-xdpi="96"
-  inkscape:export-ydpi="96"
-  inkscape:version="0.92.3 (2405546, 2018-03-11)">image/svg+xml
\ No newline at end of file
+  inkscape:label="SciLifeLab_Logotype_Green_POS"
+  transform="matrix(1.3333333,0,0,-1.3333333,0,117.17813)">
\ No newline at end of file
diff --git a/docs/input.md b/docs/input.md
deleted file mode 100644
index c92c84d5e6..0000000000
--- a/docs/input.md
+++ /dev/null
@@ -1,238 +0,0 @@
-# Input Documentation
-
-## General information about the TSV files
-
-Input files for Sarek can be specified using a TSV (Tab Separated Values) file given to the `--input` parameter (note that the delimiter is the tab (`\t`) character).
-There are different kinds of TSV files that can be used as input, depending on the input files available (FASTQ, uBAM, BAM...).
-For all the TSV files described in the next sections, here is an explanation of what the columns refer to:
-
-- `subject` designates the subject; it should be the subject ID, it must be unique for each subject, and one subject can have multiple samples (e.g.
-normal and tumor)
-- `sex` is the sex chromosomes of the subject (XX or XY)
-- `status` is the status of the measured sample (0 for Normal or 1 for Tumor)
-- `sample` designates the sample; it should be the sample ID (it is possible to have more than one tumor sample for each subject, i.e.
-a tumor and a relapse); it must be unique, but samples can have multiple lanes (which will later be merged)
-- `lane` is used when the sample is multiplexed on several lanes; it must be unique for each lane in the same sample (but does not need to be the original lane name), and must contain at least one character
-- `fastq1` is the path to the FASTQ file containing the first reads of each pair
-- `fastq2` is the path to the FASTQ file containing the second reads of each pair
-- `bam` is the path to the BAM file
-- `bai` is the path to the BAM index file
-- `recaltable` is the path to the recalibration table
-- `mpileup` is the path to the mpileup file
-
-It is recommended to use absolute paths to the files, but relative paths should also work.
-
-All examples are given for a normal/tumor pair.
-If no tumors are listed in the TSV file, the workflow will proceed as for a normal-only sample instead of a normal/tumor pair, producing only the germline Variant Calling results.
-
-Sarek will output results in a different directory for each sample.
-If multiple samples are specified in the TSV file, Sarek will consider all files to be from different samples.
-Multiple TSV files can be specified if the path is enclosed in quotes.
-
-Output from Variant Calling and/or Annotation will be in a specific directory for each sample (or normal/tumor pair if applicable).
-
-## Starting from the mapping step
-
-When starting from the mapping step (`--step mapping`), the first step of Sarek, the input can have three different forms:
-
-- A TSV file containing the sample metadata and the paths to the paired-end FASTQ files
-- The path to a directory containing the FASTQ files
-- A TSV file containing the sample metadata and the paths to the unmapped BAM (uBAM) files
-
-### Providing a TSV file with the path to FASTQ files
-
-The TSV file to start the mapping step with paired-end FASTQs should contain the columns:
-
-`subject sex status sample lane fastq1 fastq2`
-
-In this example, the normal sample has 3 read groups and the tumor sample has 2:
-
-|subject|sex|status|sample|lane|fastq1|fastq2|
-|-|-|-|-|-|-|-|
-|G15511|XX|0|C09DFN|C09DF_1|/path/to/C09DFACXX111207.1_1.fastq.gz|/path/to/C09DFACXX111207.1_2.fastq.gz|
-|G15511|XX|0|C09DFN|C09DF_2|/path/to/C09DFACXX111207.2_1.fastq.gz|/path/to/C09DFACXX111207.2_2.fastq.gz|
-|G15511|XX|0|C09DFN|C09DF_3|/path/to/C09DFACXX111207.3_1.fastq.gz|/path/to/C09DFACXX111207.3_2.fastq.gz|
-|G15511|XX|1|D0ENMT|D0ENM_1|/path/to/D0ENMACXX111207.1_1.fastq.gz|/path/to/D0ENMACXX111207.1_2.fastq.gz|
-|G15511|XX|1|D0ENMT|D0ENM_2|/path/to/D0ENMACXX111207.2_1.fastq.gz|/path/to/D0ENMACXX111207.2_2.fastq.gz|
-
-### Providing the path to a FASTQ directory
-
-Input files for Sarek can be specified using the path to a FASTQ directory given to the `--input` parameter, only with the `mapping` step.
-
-```bash
-nextflow run nf-core/sarek --input </path/to/FASTQ/directory> ...
-```
-
-#### Input FASTQ file name best practices
-
-The input folder, containing the FASTQ files for one subject (ID), should be organized into one sub-folder for every sample.
-All FASTQ files for that sample should be collected here.
-
-```text
-ID
-+--sample1
-+------sample1_lib_flowcell-index_lane_R1_1000.fastq.gz
-+------sample1_lib_flowcell-index_lane_R2_1000.fastq.gz
-+------sample1_lib_flowcell-index_lane_R1_1000.fastq.gz
-+------sample1_lib_flowcell-index_lane_R2_1000.fastq.gz
-+--sample2
-+------sample2_lib_flowcell-index_lane_R1_1000.fastq.gz
-+------sample2_lib_flowcell-index_lane_R2_1000.fastq.gz
-+--sample3
-+------sample3_lib_flowcell-index_lane_R1_1000.fastq.gz
-+------sample3_lib_flowcell-index_lane_R2_1000.fastq.gz
-+------sample3_lib_flowcell-index_lane_R1_1000.fastq.gz
-+------sample3_lib_flowcell-index_lane_R2_1000.fastq.gz
-```
-
-FASTQ filename structure:
-
-- `sample_lib_flowcell-index_lane_R1_1000.fastq.gz` and
-- `sample_lib_flowcell-index_lane_R2_1000.fastq.gz`
-
-Where:
-
-- `sample` = sample id
-- `lib` = identifier of library preparation
-- `flowcell` = identifier of flow cell for the sequencing run
-- `lane` = identifier of the lane of the sequencing run
-
-Read group information will be parsed from the FASTQ file names as follows:
-
-- `RGID` = "sample_lib_flowcell_index_lane"
-- `RGPL` = "Illumina"
-- `PU` = sample
-- `RGLB` = lib
-
-### Providing a TSV file with the paths to uBAM files
-
-The TSV file for starting the mapping from uBAM files should contain the columns:
-
-- `subject sex status sample lane bam`
-
-In this example, the normal sample has 3 read groups and the tumor sample has 2:
-
-|subject|sex|status|sample|lane|bam|
-|-|-|-|-|-|-|
-|G15511|XX|0|C09DFN|C09DF_1|/path/to/C09DFAC_1.bam|
-|G15511|XX|0|C09DFN|C09DF_2|/path/to/C09DFAC_2.bam|
-|G15511|XX|0|C09DFN|C09DF_3|/path/to/C09DFAC_3.bam|
-|G15511|XX|1|D0ENMT|D0ENM_1|/path/to/D0ENMAC_1.bam|
-|G15511|XX|1|D0ENMT|D0ENM_2|/path/to/D0ENMAC_2.bam|
-
-## Starting from the BAM prepare recalibration step
-
-To start from the preparation of the recalibration step (`--step prepare_recalibration`), a TSV file for a normal/tumor pair needs to be given as input, containing the paths to the non-recalibrated but already mapped BAM files.
-The TSV needs to contain the following columns:
-
-- `subject sex status sample bam bai`
-
-In the same way, if you have non-recalibrated BAMs and their indexes, you should use a structure like:
-
-|subject|sex|status|sample|bam|bai|
-|-|-|-|-|-|-|
-|G15511|XX|0|C09DFN|/path/to/G15511.C09DFN.md.bam|/path/to/G15511.C09DFN.md.bai|
-|G15511|XX|1|D0ENMT|/path/to/G15511.D0ENMT.md.bam|/path/to/G15511.D0ENMT.md.bai|
-
-When starting Sarek from the mapping step, a TSV file is generated automatically after the `MarkDuplicates` process.
-This TSV file is stored under `results/Preprocessing/TSV/duplicates_marked_no_table.tsv` and can be used to restart Sarek from the non-recalibrated BAM files.
-Using the parameter `--step prepare_recalibration` will automatically take this file as input.
-
-Additionally, individual TSV files for each sample (`duplicates_marked_no_table_[SAMPLE].tsv`) can be found in the same directory.
-
-If `--skip_markduplicates` has been specified, the TSV file for this step will be slightly different:
-
-|subject|sex|status|sample|bam|bai|
-|-|-|-|-|-|-|
-|G15511|XX|0|C09DFN|/path/to/G15511.C09DFN.bam|/path/to/G15511.C09DFN.bai|
-|G15511|XX|1|D0ENMT|/path/to/G15511.D0ENMT.bam|/path/to/G15511.D0ENMT.bai|
-
-When starting Sarek from the mapping step with `--skip_markduplicates`, a TSV file is generated automatically after the `Mapping` processes.
-This TSV file is stored under `results/Preprocessing/TSV/mapped.tsv` and can be used to restart Sarek from the non-recalibrated BAM files.
-Using the parameter `--step prepare_recalibration --skip_markduplicates` will automatically take this file as input.
-
-Additionally, individual TSV files for each sample (`mapped_[SAMPLE].tsv`) can be found in the same directory.
-
-## Starting from the BAM recalibration step
-
-To start from the recalibration step (`--step recalibrate`), a TSV file for a normal/tumor pair needs to be given as input, containing the paths to the non-recalibrated but already mapped BAM files.
-The TSV needs to contain the following columns:
-
-- `subject sex status sample bam bai recaltable`
-
-In the same way, if you have non-recalibrated BAMs, their indexes and their recalibration tables, you should use a structure like:
-
-|subject|sex|status|sample|bam|bai|recaltable|
-|-|-|-|-|-|-|-|
-|G15511|XX|0|C09DFN|/path/to/G15511.C09DFN.md.bam|/path/to/G15511.C09DFN.md.bai|/path/to/G15511.C09DFN.recal.table|
-|G15511|XX|1|D0ENMT|/path/to/G15511.D0ENMT.md.bam|/path/to/G15511.D0ENMT.md.bai|/path/to/G15511.D0ENMT.recal.table|
-
-When starting Sarek from the mapping step, a TSV file is generated automatically after the `BaseRecalibrator` processes.
-This TSV file is stored under `results/Preprocessing/TSV/duplicates_marked.tsv` and can be used to restart Sarek from the non-recalibrated BAM files.
-Using `--step recalibrate` will automatically take this file as input.
-
-Additionally, individual TSV files for each sample (`duplicates_marked_[SAMPLE].tsv`) can be found in the same directory.
-
-If `--skip_markduplicates` has been specified, the TSV file for this step will be slightly different:
-
-|subject|sex|status|sample|bam|bai|recaltable|
-|-|-|-|-|-|-|-|
-|G15511|XX|0|C09DFN|/path/to/G15511.C09DFN.bam|/path/to/G15511.C09DFN.bai|/path/to/G15511.C09DFN.recal.table|
-|G15511|XX|1|D0ENMT|/path/to/G15511.D0ENMT.bam|/path/to/G15511.D0ENMT.bai|/path/to/G15511.D0ENMT.recal.table|
-
-When starting Sarek from the mapping step with `--skip_markduplicates`, a TSV file is generated automatically after the `BaseRecalibrator` processes.
-This TSV file is stored under `results/Preprocessing/TSV/mapped_no_duplicates_marked.tsv` and can be used to restart Sarek from the non-recalibrated BAM files.
-Using `--step recalibrate` will automatically take this file as input.
-
-Additionally, individual TSV files for each sample (`mapped_no_duplicates_marked_[SAMPLE].tsv`) can be found in the same directory.
-
-## Starting from the variant calling step
-
-A TSV file for a normal/tumor pair with recalibrated BAM files and their indexes can be provided to start Sarek from the variant calling step (`--step variantcalling`).
-The TSV file should contain the columns:
-
-- `subject sex status sample bam bai`
-
-Here is an example for two samples from the same subject:
-
-|subject|sex|status|sample|bam|bai|
-|-|-|-|-|-|-|
-|G15511|XX|0|C09DFN|/path/to/G15511.C09DFN.recal.bam|/path/to/G15511.C09DFN.recal.bai|
-|G15511|XX|1|D0ENMT|/path/to/G15511.D0ENMT.recal.bam|/path/to/G15511.D0ENMT.recal.bai|
-
-When starting Sarek from the mapping or recalibrate steps, a TSV file is generated automatically after the recalibration processes.
-This TSV file is stored under `results/Preprocessing/TSV/recalibrated.tsv` and can be used to restart Sarek from the recalibrated BAM files.
-Using the parameter `--step variantcalling` will automatically take this file as input.
-
-Additionally, individual TSV files for each sample (`recalibrated_[SAMPLE].tsv`) can be found in the same directory.
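-
-For example, a minimal command for restarting from this step could look like the following sketch (the tool selection and TSV path here are illustrative, not prescribed, and should be adapted):
-
-```bash
-# Restart from the variant calling step using the auto-generated TSV file
-# (the tools listed here are only an example)
-nextflow run nf-core/sarek --step variantcalling --input results/Preprocessing/TSV/recalibrated.tsv --tools HaplotypeCaller,Strelka ...
-```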
-
-## Starting from the mpileup file with the Control-FREEC step
-
-To start from the Control-FREEC step (`--step Control-FREEC`), a TSV file for a normal/tumor pair needs to be given as input, containing the paths to the mpileup files.
-The TSV needs to contain the following columns:
-
-- `subject sex status sample mpileup`
-
-Here is an example for one normal/tumor pair from one subject:
-
-|subject|sex|status|sample|mpileup|
-|-|-|-|-|-|
-|G15511|XX|0|C09DFN|/path/to/G15511.C09DFN.pileup|
-|G15511|XX|1|D0ENMT|/path/to/G15511.D0ENMT.pileup|
-
-When running Sarek from the earlier steps, a TSV file is generated automatically after the `mpileup` process.
-This TSV file is stored under `results/VariantCalling/TSV/control-freec_mpileup.tsv` and can be used to restart Sarek from the mpileup files.
-Using the parameter `--step Control-FREEC` will automatically take this file as input.
-
-Additionally, individual TSV files for each sample (`control-freec_mpileup_[SAMPLE].tsv`) can be found in the same directory.
-
-## VCF files for annotation
-
-Input files for Sarek can be specified using the path to a VCF directory given to the `--input` parameter, only with the annotation step (`--step annotate`).
-As Sarek will use `bgzip` and `tabix` to compress and index the annotated VCF files, it expects the input VCF files to be sorted.
-Multiple VCF files can be specified, using a [glob path](https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob), if enclosed in quotes.
-For example:
-
-```bash
-nextflow run nf-core/sarek --step annotate --input "results/VariantCalling/*/{HaplotypeCaller,Manta,Mutect2,Strelka,TIDDIT}/*.vcf.gz" ...
-```
diff --git a/docs/install_bianca.md b/docs/install_bianca.md
deleted file mode 100644
index 5418db21d1..0000000000
--- a/docs/install_bianca.md
+++ /dev/null
@@ -1,202 +0,0 @@
-# Installation on a secure cluster
-
-This small tutorial explains how to install and run nf-core/sarek on a small test dataset on the Swedish UPPMAX cluster `bianca`, which is designed for sensitive data.
-The same steps can be followed to install it on any similar secure cluster.
-
-For more information about `bianca`, follow the [`bianca` user guide](http://uppmax.uu.se/support/user-guides/bianca-user-guide/).
-For more information about using Singularity with UPPMAX, follow the [Singularity UPPMAX guide](https://www.uppmax.uu.se/support-sv/user-guides/singularity-user-guide/).
-
-## Install Nextflow
-
-```bash
-# Connect to rackham
-> ssh -AX [USER]@rackham.uppmax.uu.se
-# Or just open a terminal
-
-# Download the Nextflow "-all" bundle
-> wget https://github.com/nextflow-io/nextflow/releases/download/v[xx.yy.zz]/nextflow-[xx.yy.zz]-all
-
-# Send to bianca (here using sftp)
-# For FileZilla follow the bianca user guide
-> sftp [USER]-[PROJECT]@bianca-sftp.uppmax.uu.se:[USER]-[PROJECT]
-> put nextflow-[xx.yy.zz]-all
-
-# Exit sftp
-> exit
-
-# Connect to bianca
-> ssh -A [USER]-[PROJECT]@bianca.uppmax.uu.se
-
-# Go to your project
-> cd /castor/project/proj_nobackup
-
-# Make a directory for Nextflow
-> mkdir tools
-> mkdir tools/nextflow
-
-# Move Nextflow from wharf to its directory
-> mv /castor/project/proj_nobackup/wharf/[USER]/[USER]-[PROJECT]/nextflow-[xx.yy.zz]-all /castor/project/proj_nobackup/tools/nextflow
-
-# Set execute permission
-> chmod a+x /castor/project/proj_nobackup/tools/nextflow/nextflow-[xx.yy.zz]-all
-
-# If you want other people to use it
-# Be sure that your group has rights to the directory as well
-
-> chown -R .[PROJECT] /castor/project/proj_nobackup/tools/nextflow/nextflow-[xx.yy.zz]-all
-
-# Make a link to it
-> ln -s /castor/project/proj_nobackup/tools/nextflow/nextflow-[xx.yy.zz]-all /castor/project/proj_nobackup/tools/nextflow/nextflow
-
-# And every time you're launching Nextflow, don't forget to export the following ENV variables
-# Or add them to your .bashrc file
-> export NXF_HOME=/castor/project/proj_nobackup/tools/nextflow/
-> export PATH=${NXF_HOME}:${PATH}
-> export NXF_TEMP=$SNIC_TMP
-> export NXF_LAUNCHER=$SNIC_TMP
-> export NXF_SINGULARITY_CACHEDIR=/sw/data/uppnex/ToolBox/sarek
-```
-
-## Install nf-core/sarek
-
-nf-core/sarek uses Singularity containers to package all the different tools.
-All containers and all reference files are already stored on UPPMAX.
-
-As `bianca` is secure, no direct download is available, so nf-core/sarek will have to be installed and updated manually.
-
-You can either download nf-core/sarek on your computer or on `rackham`, make an archive, and send it to `bianca` using `FileZilla` or `sftp`, depending on your preferences.
-
-```bash
-# Connect to rackham
-> ssh -AX [USER]@rackham.uppmax.uu.se
-# Or just open a terminal
-
-# Clone the repository
-> git clone https://github.com/nf-core/sarek.git
-
-# Go to the newly created directory
-> cd sarek
-
-# It is also possible to checkout a specific version using
-> git checkout [VERSION]
-
-# You can also include nf-core/test-datasets and nf-core/configs using git-archive-all
-# Install pip
-> curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
-> python get-pip.py
-
-# If it fails due to permission, you could consider using
-> python get-pip.py --user
-
-# Install git-archive-all using pip
-> pip install git-archive-all
-# If you used --user before, you might want to do that here too
-> pip install git-archive-all --user
-> ./scripts/make_snapshot.sh --include-test-data --include-configs
-
-# Or you can just include nf-core/sarek:
-> ./scripts/make_snapshot.sh
-
-# You will get this message in your terminal
-Wrote sarek-[snapID].tar.gz
-
-# Send the archive to bianca (here using sftp)
-# For FileZilla follow the bianca user guide
-> sftp [USER]-[PROJECT]@bianca-sftp.uppmax.uu.se:[USER]-[PROJECT]
-> put sarek-[snapID].tar.gz
-> exit
-
-# The archive will be in the wharf folder in your user home on your bianca project
-
-# Connect to bianca
-> ssh -A [USER]-[PROJECT]@bianca.uppmax.uu.se
-
-# Go to your project
-> cd /castor/project/proj_nobackup
-
-# Make and go into a nf-core/sarek directory (where you will store all nf-core/sarek versions)
-> mkdir sarek
-> cd sarek
-
-# Copy the tar from wharf to the project
-> cp /castor/project/proj_nobackup/wharf/[USER]/[USER]-[PROJECT]/sarek-[snapID].tar.gz /castor/project/proj_nobackup/sarek
-
-# Extract the archive. Also remember to extract the containers you uploaded.
-> tar xvzf sarek-[snapID].tar.gz
-
-# If you want other people to use it,
-# Be sure that your group has rights to the directory as well
-> chown -R .[PROJECT] sarek-[snapID]
-
-# Make a symbolic link to the extracted repository
-> ln -s sarek-[snapID] default
-```
-
-The principle is for every member of your project to be able to use the same nf-core/sarek version at the same time.
-So every member of the project who wants to use nf-core/sarek will need to do:
-
-```bash
-# Connect to bianca
-> ssh -A [USER]-[PROJECT]@bianca.uppmax.uu.se
-
-# Go to your user directory
-> cd /home/[USER]
-
-# Make a symbolic link to the default nf-core/sarek
-> ln -s /castor/project/proj_nobackup/sarek/default sarek
-```
-
-Singularity images for Sarek are available on Uppmax in `/sw/data/uppnex/ToolBox/sarek`.
-Sometimes Nextflow needs write access to the image folder, and if so the images need to be copied to a location with write permission, for example in a subfolder of your project folder.
-
-```bash
-# Create a folder for the Singularity images somewhere in your project:
-mkdir sarek_simg
-
-# Copy the relevant Singularity image from the write-protected folder on Uppmax to the folder where you have write permission:
-cp /sw/data/uppnex/ToolBox/sarek/nfcore-sarek-dev.img /path/to/your/sarek_simg/.
-
-# Update the ENV parameter NXF_SINGULARITY_CACHEDIR
-export NXF_SINGULARITY_CACHEDIR=/path/to/your/sarek_simg
-```
-
-And then nf-core/sarek can be used with:
-```bash
-> nextflow run ~/sarek/main.nf -profile uppmax --custom_config_base ~/sarek/configs --project [PROJECT] --genome [GENOME ASSEMBLY] ...
-```
-
-This command worked on Bianca on 2019-09-06:
-
-```bash
-> screen -S SAMPLE /path/to/nextflow run /path/to/sarek/main.nf -profile uppmax --project PROJID --sample SAMPLE.tsv --genome GRCh37 --genomes_base /sw/data/uppnex/ToolBox/ReferenceAssemblies/hg38make/bundle/2.8/b37 --step variantcalling --tools ASCAT --igenomesIgnore
-
-# To detach the screen:
-ctrl-A-D
-```
-
-This is an example of how to run Sarek with the tool Manta and the genome assembly version GRCh38:
-
-```bash
-> nextflow run ~/sarek/main.nf -profile uppmax --custom_config_base ~/sarek/configs --project [PROJECT] --tools Manta --sample [SAMPLE.TSV] --genome GRCh38
-```
-
-## Update nf-core/sarek
-
-Repeat the same steps as for installing nf-core/sarek, and once the tar has been extracted, you can replace the link.
-
-```bash
-# Connect to bianca (connect to rackham first if needed)
-> ssh -A [USER]-[PROJECT]@bianca.uppmax.uu.se
-
-# Go to the sarek directory in your project
-> cd /castor/project/proj_nobackup/sarek
-
-# Remove the link
-> rm default
-
-# Link to the new nf-core/sarek version
-> ln -s sarek-[NEWsnapID] default
-```
-
-You can for example keep a `default` version that you are sure is working, and make a link for a `testing` or `development` version.
diff --git a/docs/output.md b/docs/output.md
index 0e500afd00..ce5758a240 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -2,19 +2,25 @@
 
 This document describes the output produced by the pipeline.
 
+The directories listed below will be created in the results directory after the pipeline has finished.
+All paths are relative to the top-level results directory.
+
 ## Pipeline overview
 
 The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:
 
 - [Preprocessing](#preprocessing)
   - [Map to Reference](#map-to-reference)
-    - [BWA mem](#bwa-mem)
+    - [bwa](#bwa)
+    - [BWA-mem2](#bwa-mem2)
   - [Mark Duplicates](#mark-duplicates)
-    - [GATK MarkDuplicatesSpark](#gatk-markduplicatesspark)
+    - [GATK MarkDuplicates](#gatk-markduplicates)
   - [Base (Quality Score) Recalibration](#base-quality-score-recalibration)
     - [GATK BaseRecalibrator](#gatk-baserecalibrator)
     - [GATK ApplyBQSR](#gatk-applybqsr)
   - [TSV files](#tsv-files)
+    - [TSV files with `--skip_markduplicates`](#tsv-files-with---skip_markduplicates)
+    - [TSV files with `--sentieon`](#tsv-files-with---sentieon)
 - [Variant Calling](#variant-calling)
   - [SNVs and small indels](#snvs-and-small-indels)
     - [FreeBayes](#freebayes)
@@ -43,7 +49,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 - [QC](#qc)
   - [FastQC](#fastqc)
   - [bamQC](#bamqc)
-  - [MarkDuplicates reports](#markduplicates-reports)
+  - [GATK MarkDuplicates reports](#gatk-markduplicates-reports)
   - [samtools stats](#samtools-stats)
   - [bcftools stats](#bcftools-stats)
   - [VCFtools](#vcftools)
@@ -51,97 +57,123 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
   - [VEP reports](#vep-reports)
 - [Reporting](#reporting)
   - [MultiQC](#multiqc)
+- [Pipeline information](#pipeline-information)
 
 ## Preprocessing
 
-Sarek preprocesses raw FastQ files or unmapped BAM files, based on [GATK best practices](https://software.broadinstitute.org/gatk/best-practices/).
-
-BAM files with Recalibration tables can also be used as an input to start with the recalibration of said BAM files, for more information see [TSV files output information](#tsv-files)
+`Sarek` pre-processes raw `FASTQ` files or `unmapped BAM` files, based on [GATK best practices](https://gatk.broadinstitute.org/hc/en-us/sections/360007226651-Best-Practices-Workflows).
 
 ### Map to Reference
 
-#### BWA mem
+#### bwa
 
-[BWA mem](http://bio-bwa.sourceforge.net/) is a software package for mapping low-divergent sequences against a large reference genome.
+[bwa](https://github.com/lh3/bwa) is a software package for mapping low-divergent sequences against a large reference genome.
 
 Such files are intermediate and not kept in the final files delivered to users.
 
-### Mark Duplicates
+#### BWA-mem2
+
+[BWA-mem2](https://github.com/bwa-mem2/bwa-mem2) is a software package for mapping low-divergent sequences against a large reference genome.
 
-#### GATK MarkDuplicatesSpark
+Such files are intermediate and not kept in the final files delivered to users.
+
+### Mark Duplicates
 
-[GATK MarkDuplicatesSpark](https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_spark_transforms_markduplicates_MarkDuplicatesSpark.php) is a Spark implementation of [Picard MarkDuplicates](https://software.broadinstitute.org/gatk/documentation/tooldocs/current/picard_sam_markduplicates_MarkDuplicates.php) and locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are defined as originating from a single fragment of DNA.
+#### GATK MarkDuplicates
 
-If the pipeline is run with the option `--no_gatk_spark` then [GATK MarkDuplicates](https://software.broadinstitute.org/gatk/documentation/tooldocs/4.1.4.0/picard_sam_markduplicates_MarkDuplicates.php) is used instead.
+By default, `Sarek` will use [GATK MarkDuplicatesSpark](https://gatk.broadinstitute.org/hc/en-us/articles/360042912511-MarkDuplicatesSpark), the `Spark` implementation of [GATK MarkDuplicates](https://gatk.broadinstitute.org/hc/en-us/articles/360042477492-MarkDuplicates-Picard), which locates and tags duplicate reads in a `BAM` or `SAM` file, where duplicate reads are defined as originating from a single fragment of DNA.
 
-This directory is the location for the BAM files delivered to users.
-Besides the duplicates-marked BAM files, the recalibration tables (`*.recal.table`) are also stored, and can be used to create base recalibrated files.
+Specify `--no_gatk_spark` to use `GATK MarkDuplicates` instead (a minimal sketch of this is shown at the end of this section).
 
-For further reading and documentation see the [data pre-processing workflow from the GATK best practices](https://software.broadinstitute.org/gatk/best-practices/workflow?id=11165).
+This directory is the location for the `BAM` files delivered to users.
+Besides the `duplicates-marked BAM` files, the recalibration tables (`*.recal.table`) are also stored, and can be used to create `recalibrated BAM` files.
 
 For all samples:
+
 **Output directory: `results/Preprocessing/[SAMPLE]/DuplicatesMarked`**
 
 - `[SAMPLE].md.bam` and `[SAMPLE].md.bai`
-  - BAM file and index
+  - `BAM` file and index
+
+For further reading and documentation see the [data pre-processing for variant discovery from the GATK best practices](https://gatk.broadinstitute.org/hc/en-us/articles/360035535912-Data-pre-processing-for-variant-discovery).
 
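+A minimal sketch of adding this flag to a run (the input TSV name here is only a placeholder, and all other parameters are elided):
+
+```bash
+# Fall back to the non-Spark GATK MarkDuplicates implementation
+# (samples.tsv is a placeholder for your input TSV file)
+nextflow run nf-core/sarek --input samples.tsv --no_gatk_spark ...
+```
+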
 ### Base (Quality Score) Recalibration
 
 #### GATK BaseRecalibrator
 
-[GATK BaseRecalibrator](https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_bqsr_BaseRecalibrator.php) generates a recalibration table based on various covariates.
+[GATK BaseRecalibrator](https://gatk.broadinstitute.org/hc/en-us/articles/360042477672-BaseRecalibrator) generates a recalibration table based on various covariates.
 
 For all samples:
+
 **Output directory: `results/Preprocessing/[SAMPLE]/DuplicatesMarked`**
 
 - `[SAMPLE].recal.table`
-  - Recalibration Table associated to the duplicates marked BAMs.
+  - Recalibration table associated with the `duplicates-marked BAM` file.
 
 #### GATK ApplyBQSR
 
-[GATK ApplyBQSR](https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_bqsr_ApplyBQSR.php) recalibrates the base qualities of the input reads based on the recalibration table produced by the [`BaseRecalibrator`](#gatk-baserecalibrator) tool.
+[GATK ApplyBQSR](https://gatk.broadinstitute.org/hc/en-us/articles/360042476852-ApplyBQSR) recalibrates the base qualities of the input reads based on the recalibration table produced by the [GATK BaseRecalibrator](#gatk-baserecalibrator) tool.
 
-This directory is usually empty, it is the location for the final recalibrated BAM files.
-Recalibrated BAM files are usually 2-3 times larger than the duplicates-marked BAM files.
-To re-generate recalibrated BAM file you have to apply the recalibration table delivered to the `DuplicatesMarked` directory either within Sarek, or doing this recalibration step yourself.
-
-For further reading and documentation see the [data pre-processing workflow from the GATK best practices](https://software.broadinstitute.org/gatk/best-practices/workflow?id=11165).
+This directory is the location for the final `recalibrated BAM` files.
+`Recalibrated BAM` files are usually 2-3 times larger than the `duplicates-marked BAM` files.
+To re-generate a `recalibrated BAM` file you have to apply the recalibration table delivered to the `DuplicatesMarked` folder, either using `Sarek` ([`--step recalibrate`](usage.md#step-recalibrate)) or doing this recalibration yourself.
 
 For all samples:
+
 **Output directory: `results/Preprocessing/[SAMPLE]/Recalibrated`**
 
 - `[SAMPLE].recal.bam` and `[SAMPLE].recal.bam.bai`
-  - BAM file and index
+  - `BAM` file and index
+
+For further reading and documentation see the [data pre-processing for variant discovery from the GATK best practices](https://gatk.broadinstitute.org/hc/en-us/articles/360035535912-Data-pre-processing-for-variant-discovery).
 
 ### TSV files
 
-The TSV files are auto-generated and can be used by Sarek for further processing and/or variant calling.
+The `TSV` files are auto-generated and can be used by `Sarek` for further processing and/or variant calling.
 
-For further reading and documentation see the [input documentation](https://github.com/nf-core/sarek/blob/master/docs/input.md).
+For further reading and documentation see the [`--input`](usage.md#--input) section in the usage documentation.
 
 For all samples:
+
 **Output directory: `results/Preprocessing/TSV`**
 
 - `duplicates_marked_no_table.tsv`, `duplicates_marked.tsv` and `recalibrated.tsv`
-  - TSV files to start Sarek from `prepare_recalibration`, `recalibrate` or `variantcalling` steps.
-- `duplicates_marked_no_table_[SAMPLE].tsv` `duplicates_marked_[SAMPLE].tsv` and `recalibrated_[SAMPLE].tsv`
-  - TSV files to start Sarek from `prepare_recalibration`, `recalibrate` or `variantcalling` steps for a specific sample.
+  - `TSV` files to start `Sarek` from the `prepare_recalibration`, `recalibrate` or `variantcalling` steps.
+- `duplicates_marked_no_table_[SAMPLE].tsv`, `duplicates_marked_[SAMPLE].tsv` and `recalibrated_[SAMPLE].tsv`
+  - `TSV` files to start `Sarek` from the `prepare_recalibration`, `recalibrate` or `variantcalling` steps for a specific sample.
 
-> `/!\` Only with [`--sentieon`](usage.md#--sentieon)
+### TSV files with `--skip_markduplicates`
+
+> **WARNING** Only with [`--skip_markduplicates`](usage.md#--skip_markduplicates)
 
 For all samples:
+
+**Output directory: `results/Preprocessing/TSV`**
+
+- `mapped.tsv`, `mapped_no_duplicates_marked.tsv` and `recalibrated.tsv`
+  - `TSV` files to start `Sarek` from the `prepare_recalibration`, `recalibrate` or `variantcalling` steps.
+- `mapped_[SAMPLE].tsv`, `mapped_no_duplicates_marked_[SAMPLE].tsv` and `recalibrated_[SAMPLE].tsv`
+  - `TSV` files to start `Sarek` from the `prepare_recalibration`, `recalibrate` or `variantcalling` steps for a specific sample.
+
+### TSV files with `--sentieon`
+
+> **WARNING** Only with [`--sentieon`](usage.md#--sentieon)
+
+For all samples:
+
 **Output directory: `results/Preprocessing/TSV`**
 
 - `sentieon_deduped.tsv` and `recalibrated_sentieon.tsv`
-  - TSV files to start Sarek from `variantcalling` step.
+  - `TSV` files to start `Sarek` from the `variantcalling` step.
 - `sentieon_deduped_[SAMPLE].tsv` and `recalibrated_sentieon_[SAMPLE].tsv`
-  - TSV files to start Sarek from `variantcalling` step for a specific sample.
+  - `TSV` files to start `Sarek` from the `variantcalling` step for a specific sample.
 
 ## Variant Calling
 
-All the results regarding Variant Calling are collected in this directory. If some results from a variant caller do not appear here, please check out the [Variant calling](./variant_calling.md) documentation.
+All the results regarding Variant Calling are collected in this directory.
+If some results from a variant caller do not appear here, please check out the [`--tools`](usage.md#--tools) section in the usage documentation.
 
-Recalibrated BAM files can also be used as an input to start the Variant Calling, for more information see [TSV files output information](#tsv-files)
+`Recalibrated BAM` files can be used as an input to start the Variant Calling.
 
 ### SNVs and small indels
 
 #### FreeBayes
 
 [FreeBayes](https://github.com/ekg/freebayes) is a Bayesian genetic variant detector designed to find small polymorphisms, specifically SNPs, indels, MNPs, and complex events smaller than the length of a short-read sequencing alignment.
 
-For further reading and documentation see the [FreeBayes manual](https://github.com/ekg/freebayes/blob/master/README.md#user-manual-and-guide).
-
 For all samples:
+
 **Output directory: `results/VariantCalling/[SAMPLE]/FreeBayes`**
 
 - `FreeBayes_[SAMPLE].vcf.gz` and `FreeBayes_[SAMPLE].vcf.gz.tbi`
-  - VCF with Tabix index
+  - `VCF` with Tabix index
 
-#### GATK HaplotypeCaller
+For further reading and documentation see the [FreeBayes manual](https://github.com/ekg/freebayes/blob/master/README.md#user-manual-and-guide).
 
-[GATK HaplotypeCaller](https://github.com/broadinstitute/gatk) calls germline SNPs and indels via local re-assembly of haplotypes.
+#### GATK HaplotypeCaller
 
-Germline calls are provided for all samples, to able comparison of both tumor and normal for possible mixup.
+[GATK HaplotypeCaller](https://gatk.broadinstitute.org/hc/en-us/articles/360042913231-HaplotypeCaller) calls germline SNPs and indels via local re-assembly of haplotypes.
 
-For further reading and documentation see the [HaplotypeCaller manual](https://software.broadinstitute.org/gatk/documentation/tooldocs/4.1.2.0/org_broadinstitute_hellbender_tools_walkers_haplotypecaller_HaplotypeCaller.php).
+Germline calls are provided for all samples, to enable comparison of both tumor and normal for a possible mixup.
 
 For all samples:
+
 **Output directory: `results/VariantCalling/[SAMPLE]/HaploTypeCaller`**
 
 - `HaplotypeCaller_[SAMPLE].vcf.gz` and `HaplotypeCaller_[SAMPLE].vcf.gz.tbi`
-  - VCF with Tabix index
+  - `VCF` with Tabix index
 
-#### GATK GenotypeGVCFs
+For further reading and documentation see the [HaplotypeCaller manual](https://gatk.broadinstitute.org/hc/en-us/articles/360042913231-HaplotypeCaller).
 
-[GATK GenotypeGVCFs](https://github.com/broadinstitute/gatk) performs joint genotyping on one or more samples pre-called with HaplotypeCaller.
+#### GATK GenotypeGVCFs
 
-Germline calls are provided for all samples, to able comparison of both tumor and normal for possible mixup.
+[GATK GenotypeGVCFs](https://gatk.broadinstitute.org/hc/en-us/articles/360042914991-GenotypeGVCFs) performs joint genotyping on one or more samples pre-called with HaplotypeCaller.
 
-For further reading and documentation see the [GenotypeGVCFs manual](https://software.broadinstitute.org/gatk/documentation/tooldocs/4.1.2.0/org_broadinstitute_hellbender_tools_walkers_GenotypeGVCFs.php).
+Germline calls are provided for all samples, to enable comparison of both tumor and normal for a possible mixup.
 
 For all samples:
+
 **Output directory: `results/VariantCalling/[SAMPLE]/HaplotypeCallerGVCF`**
 
 - `HaplotypeCaller_[SAMPLE].g.vcf.gz` and `HaplotypeCaller_[SAMPLE].g.vcf.gz.tbi`
-  - VCF with Tabix index
+  - `VCF` with Tabix index
+
+For further reading and documentation see the [GenotypeGVCFs manual](https://gatk.broadinstitute.org/hc/en-us/articles/360042914991-GenotypeGVCFs).
 
 #### GATK Mutect2
 
-[GATK Mutect2](https://github.com/broadinstitute/gatk) calls somatic SNVs and indels via local assembly of haplotypes.
+[GATK Mutect2](https://gatk.broadinstitute.org/hc/en-us/articles/360042477952-Mutect2) calls somatic SNVs and indels via local assembly of haplotypes.
+
+For further reading and documentation see the [Mutect2 manual](https://gatk.broadinstitute.org/hc/en-us/articles/360042477952-Mutect2).
+It is recommended to have a [panel of normals (PON)](https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON) for this version of `GATK Mutect2`, built using at least 40 normal samples.
+Additionally, you can add your `PON` file to get filtered somatic calls.
 
-For further reading and documentation see the [Mutect2 manual](https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_mutect_Mutect2.php).
-It is recommended to have panel of normals [PON](https://gatkforums.broadinstitute.org/gatk/discussion/11136/how-to-call-somatic-mutations-using-gatk4-mutect2) for this version of Mutect2 using at least 40 normal samples, and you can add your PON file to get filtered somatic calls.
+For a Tumor/Normal pair:
 
-For a Tumor/Normal pair only:
 **Output directory: `results/VariantCalling/[TUMOR_vs_NORMAL]/Mutect2`**
 
 Files created:
 
 - `Mutect2_unfiltered_[TUMORSAMPLE]_vs_[NORMALSAMPLE].vcf.gz` and `Mutect2_unfiltered_[TUMORSAMPLE]_vs_[NORMALSAMPLE].vcf.gz.tbi`
-  - unfiltered (raw) Mutect2 calls VCF with Tabix index
+  - unfiltered (raw) Mutect2 calls `VCF` with Tabix index
 - `Mutect2_filtered_[TUMORSAMPLE]_vs_[NORMALSAMPLE].vcf.gz` and `Mutect2_filtered_[TUMORSAMPLE]_vs_[NORMALSAMPLE].vcf.gz.tbi`
-  - filtered Mutect2 calls VCF with Tabix index: these entries has a PASS filter, you can get these when supplying a panel of normals using the `--pon` option
+  - filtered Mutect2 calls `VCF` with Tabix index: these entries have a `PASS` filter; you can get these when supplying a panel of normals using the `--pon` option
 - `[TUMORSAMPLE]_vs_[NORMALSAMPLE].vcf.gz.stats`
-  - a stats file generated during calling raw variants (needed for filtering)
+  - a stats file generated during calling of raw variants (needed for filtering)
 - `[TUMORSAMPLE]_contamination.table`
-  - a text file exported when panel-of-normals provided about sample contamination
+  - a text file with sample contamination estimates, exported when a panel-of-normals is provided
 
 #### samtools mpileup
 
-[samtools mpileup](https://www.htslib.org/doc/samtools.html) generate pileup for a BAM file.
-
-For further reading and documentation see the [samtools manual](https://www.htslib.org/doc/samtools.html#COMMANDS_AND_OPTIONS).
+[samtools mpileup](https://www.htslib.org/doc/samtools.html) generates a pileup of a `BAM` file.
 
 For all samples:
+
 **Output directory: `results/VariantCalling/[SAMPLE]/mpileup`**
 
 - `[SAMPLE].pileup.gz`
-  - The pileup format is a text-based format for summarizing the base calls of aligned reads to a reference sequence. Alignment records are grouped by sample (SM) identifiers in @RG header lines.
+  - The pileup format is a text-based format for summarizing the base calls of aligned reads to a reference sequence. Alignment records are grouped by sample (`SM`) identifiers in `@RG` header lines.
+
+For further reading and documentation see the [samtools manual](https://www.htslib.org/doc/samtools.html#COMMANDS_AND_OPTIONS).
 
 #### Strelka2
 
 [Strelka2](https://github.com/Illumina/strelka) is a fast and accurate small variant caller optimized for analysis of germline variation in small cohorts and somatic variation in tumor/normal sample pairs.
 
-For further reading and documentation see the [Strelka2 user guide](https://github.com/Illumina/strelka/blob/v2.9.x/docs/userGuide/README.md).
-
 For all samples:
+
 **Output directory: `results/VariantCalling/[SAMPLE]/Strelka`**
 
 - `Strelka_Sample_genome.vcf.gz` and `Strelka_Sample_genome.vcf.gz.tbi`
-  - VCF with Tabix index
+  - `VCF` with Tabix index
 - `Strelka_Sample_variants.vcf.gz` and `Strelka_Sample_variants.vcf.gz.tbi`
-  - VCF with Tabix index
+  - `VCF` with Tabix index
 
 For a Tumor/Normal pair:
+
 **Output directory: `results/VariantCalling/[TUMOR_vs_NORMAL]/Strelka`**
 
 - `Strelka_[TUMORSAMPLE]_vs_[NORMALSAMPLE]_somatic_indels.vcf.gz` and `Strelka_[TUMORSAMPLE]_vs_[NORMALSAMPLE]_somatic_indels.vcf.gz.tbi`
-  - VCF with Tabix index
+  - `VCF` with Tabix index
 - `Strelka_[TUMORSAMPLE]_vs_[NORMALSAMPLE]_somatic_snvs.vcf.gz` and `Strelka_[TUMORSAMPLE]_vs_[NORMALSAMPLE]_somatic_snvs.vcf.gz.tbi`
-  - VCF with Tabix index
+  - `VCF` with Tabix index
+
+Using [Strelka Best Practices](https://github.com/Illumina/strelka/blob/master/docs/userGuide/README.md#somatic-configuration-example) with the `candidateSmallIndels` from `Manta`:
 
-Using [Strelka Best Practices](https://github.com/Illumina/strelka/blob/v2.9.x/docs/userGuide/README.md#somatic-configuration-example) with the `candidateSmallIndels` from `Manta`:
 **Output directory: `results/VariantCalling/[TUMOR_vs_NORMAL]/Strelka`**
 
 - `StrelkaBP_[TUMORSAMPLE]_vs_[NORMALSAMPLE]_somatic_indels.vcf.gz` and `StrelkaBP_[TUMORSAMPLE]_vs_[NORMALSAMPLE]_somatic_indels.vcf.gz.tbi`
-  - VCF with Tabix index
+  - `VCF` with Tabix index
 - `StrelkaBP_[TUMORSAMPLE]_vs_[NORMALSAMPLE]_somatic_snvs.vcf.gz` and `StrelkaBP_[TUMORSAMPLE]_vs_[NORMALSAMPLE]_somatic_snvs.vcf.gz.tbi`
-  - VCF with Tabix index
+  - `VCF` with Tabix index
+
+For further reading and documentation see the [Strelka2 user guide](https://github.com/Illumina/strelka/blob/master/docs/userGuide/README.md).
 
 #### Sentieon DNAseq
 
-> `/!\` Only with [`--sentieon`](usage.md#--sentieon)
+> **WARNING** Only with [`--sentieon`](usage.md#--sentieon)
 
 [Sentieon DNAseq](https://www.sentieon.com/products/#dnaseq) implements the same mathematics used in the Broad Institute's BWA-GATK HaplotypeCaller 3.3-4.1 Best Practices Workflow pipeline.
 
-For further reading and documentation see the [Sentieon DNAseq user guide](https://support.sentieon.com/manual/DNAseq_usage/dnaseq/).
-
 For all samples:
+
 **Output directory: `results/VariantCalling/[SAMPLE]/SentieonDNAseq`**
 
 - `DNAseq_Sample.vcf.gz` and `DNAseq_Sample.vcf.gz.tbi`
-  - VCF with Tabix index
+  - `VCF` with Tabix index
+
+For further reading and documentation see the [Sentieon DNAseq user guide](https://support.sentieon.com/manual/DNAseq_usage/dnaseq/).
 
 #### Sentieon DNAscope
 
-> `/!\` Only with [`--sentieon`](usage.md#--sentieon)
+> **WARNING** Only with [`--sentieon`](usage.md#--sentieon)
 
 [Sentieon DNAscope](https://www.sentieon.com/products) calls SNPs and small indels.
 
-For further reading and documentation see the [Sentieon DNAscope user guide](https://support.sentieon.com/manual/DNAscope_usage/dnascope/).
-
 For all samples:
+
 **Output directory: `results/VariantCalling/[SAMPLE]/SentieonDNAscope`**
 
 - `DNAscope_Sample.vcf.gz` and `DNAscope_Sample.vcf.gz.tbi`
-  - VCF with Tabix index
+  - `VCF` with Tabix index
+
+For further reading and documentation see the [Sentieon DNAscope user guide](https://support.sentieon.com/manual/DNAscope_usage/dnascope/).
 
 #### Sentieon TNscope
 
-> `/!\` Only with [`--sentieon`](usage.md#--sentieon)
+> **WARNING** Only with [`--sentieon`](usage.md#--sentieon)
 
 [Sentieon TNscope](https://www.sentieon.com/products/#tnscope) calls SNPs and small indels on a Tumor/Normal pair.
-For further reading and documentation see the [Sentieon TNscope user guide](https://support.sentieon.com/manual/TNscope_usage/tnscope/).
-
 For a Tumor/Normal pair:
+
 **Output directory: `results/VariantCalling/[TUMOR_vs_NORMAL]/SentieonTNscope`**
 
 - `TNscope_[TUMORSAMPLE]_vs_[NORMALSAMPLE].vcf.gz` and `TNscope_[TUMORSAMPLE]_vs_[NORMALSAMPLE].vcf.gz.tbi`
-  - VCF with Tabix index
+  - `VCF` with Tabix index
+
+For further reading and documentation see the [Sentieon TNscope user guide](https://support.sentieon.com/manual/TNscope_usage/tnscope/).
 
 ### Structural Variants
 
 #### Manta
 
 [Manta](https://github.com/Illumina/manta) calls structural variants (SVs) and indels from mapped paired-end sequencing reads.
 It is optimized for analysis of germline variation in small sets of individuals and somatic variation in tumor/normal sample pairs.
 
-`Manta` provides a candidate list for small indels also that can be fed to `Strelka` following [Strelka Best Practices](https://github.com/Illumina/strelka/blob/v2.9.x/docs/userGuide/README.md#somatic-configuration-example).
-
-For further reading and documentation see the [Manta user guide](https://github.com/Illumina/manta/blob/master/docs/userGuide/README.md).
+`Manta` provides a candidate list for small indels that can be fed to `Strelka` following [Strelka Best Practices](https://github.com/Illumina/strelka/blob/master/docs/userGuide/README.md#somatic-configuration-example).
 
 For all samples:
+
 **Output directory: `results/VariantCalling/[SAMPLE]/Manta`**
 
 - `Manta_[SAMPLE].candidateSmallIndels.vcf.gz` and `Manta_[SAMPLE].candidateSmallIndels.vcf.gz.tbi`
-  - VCF with Tabix index
+  - `VCF` with Tabix index
 - `Manta_[SAMPLE].candidateSV.vcf.gz` and `Manta_[SAMPLE].candidateSV.vcf.gz.tbi`
-  - VCF with Tabix index
+  - `VCF` with Tabix index
 
 For a Normal sample only:
 
 - `Manta_[NORMALSAMPLE].diploidSV.vcf.gz` and `Manta_[NORMALSAMPLE].diploidSV.vcf.gz.tbi`
-  - VCF with Tabix index
+  - `VCF` with Tabix index
 
 For a Tumor sample only:
 
 - `Manta_[TUMORSAMPLE].tumorSV.vcf.gz` and `Manta_[TUMORSAMPLE].tumorSV.vcf.gz.tbi`
-  - VCF with Tabix index
+  - `VCF` with Tabix index
+
+For a Tumor/Normal pair:
 
-For a Tumor/Normal pair only:
 **Output directory: `results/VariantCalling/[TUMOR_vs_NORMAL]/Manta`**
 
 - `Manta_[TUMORSAMPLE]_vs_[NORMALSAMPLE].candidateSmallIndels.vcf.gz` and `Manta_[TUMORSAMPLE]_vs_[NORMALSAMPLE].candidateSmallIndels.vcf.gz.tbi`
-  - VCF with Tabix index
+  - `VCF` with Tabix index
 - `Manta_[TUMORSAMPLE]_vs_[NORMALSAMPLE].candidateSV.vcf.gz` and `Manta_[TUMORSAMPLE]_vs_[NORMALSAMPLE].candidateSV.vcf.gz.tbi`
-  - VCF with Tabix index
+  - `VCF` with Tabix index
 - `Manta_[TUMORSAMPLE]_vs_[NORMALSAMPLE].diploidSV.vcf.gz` and `Manta_[TUMORSAMPLE]_vs_[NORMALSAMPLE].diploidSV.vcf.gz.tbi`
-  - VCF with Tabix index
+  - `VCF` with Tabix index
 - `Manta_[TUMORSAMPLE]_vs_[NORMALSAMPLE].somaticSV.vcf.gz` and `Manta_[TUMORSAMPLE]_vs_[NORMALSAMPLE].somaticSV.vcf.gz.tbi`
-  - VCF with Tabix index
+  - `VCF` with Tabix index
+
+For further reading and documentation see the [Manta user guide](https://github.com/Illumina/manta/blob/master/docs/userGuide/README.md).
 
 #### TIDDIT
 
 [TIDDIT](https://github.com/SciLifeLab/TIDDIT) identifies intra- and inter-chromosomal translocations, deletions, tandem duplications and inversions.
 
-Germline calls are provided for all samples, to able comparison of both tumor and normal for possible mixup.
-Low quality calls are removed internally, to simplify processing of variant calls but they are saved by Sarek.
-
-For further reading and documentation see the [TIDDIT manual](https://github.com/SciLifeLab/TIDDIT/blob/master/README.md).
+Germline calls are provided for all samples, to enable comparison of both tumor and normal for a possible mixup.
+Low-quality calls are removed internally to simplify processing of variant calls, but they are saved by `Sarek`.
 
 For all samples:
+
 **Output directory: `results/VariantCalling/[SAMPLE]/TIDDIT`**
 
 - `TIDDIT_[SAMPLE].vcf.gz` and `TIDDIT_[SAMPLE].vcf.gz.tbi`
-  - VCF with Tabix index
+  - `VCF` with Tabix index
 - `TIDDIT_[SAMPLE].signals.tab`
   - tab file describing coverage across the genome, binned per 50 bp
 - `TIDDIT_[SAMPLE].ploidy.tab`
   - tab file describing the estimated ploidy and coverage across each contig
 - `TIDDIT_[SAMPLE].old.vcf`
-  - VCF including the low qualiy calls
+  - `VCF` including the low quality calls
 - `TIDDIT_[SAMPLE].wig`
   - wiggle file containing coverage across the genome, binned per 50 bp
 - `TIDDIT_[SAMPLE].gc.wig`
   - wiggle file containing fraction of gc content, binned per 50 bp
 
+For further reading and documentation see the [TIDDIT manual](https://github.com/SciLifeLab/TIDDIT/blob/master/README.md).
+
 #### Sentieon DNAscope SV
 
-> `/!\` Only with [`--sentieon`](usage.md#--sentieon)
+> **WARNING** Only with [`--sentieon`](usage.md#--sentieon)
 
 [Sentieon DNAscope](https://www.sentieon.com/products) can perform structural variant calling in addition to calling SNPs and small indels.
 
-For further reading and documentation see the [Sentieon DNAscope user guide](https://support.sentieon.com/manual/DNAscope_usage/dnascope/).
-
 For all samples:
+
 **Output directory: `results/VariantCalling/[SAMPLE]/SentieonDNAscope`**
 
 - `DNAscope_SV_Sample.vcf.gz` and `DNAscope_SV_Sample.vcf.gz.tbi`
-  - VCF with Tabix index
+  - `VCF` with Tabix index
+
+For further reading and documentation see the [Sentieon DNAscope user guide](https://support.sentieon.com/manual/DNAscope_usage/dnascope/).
 
 ### Sample heterogeneity, ploidy and CNVs
 
 #### ConvertAlleleCounts
 
-[ConvertAlleleCounts](https://github.com/nf-core/sarek/blob/master/bin/convertAlleleCounts.r) is a R-script for converting output from AlleleCount to BAF and LogR values.
+Running ASCAT on NGS data requires that the `BAM` files are converted into BAF and LogR values.
+This can be done using the software [AlleleCount](https://github.com/cancerit/alleleCount) followed by the provided [ConvertAlleleCounts](https://github.com/nf-core/sarek/blob/master/bin/convertAlleleCounts.r) R-script.
+
+For a Tumor/Normal pair:
 
-For a Tumor/Normal pair only:
 **Output directory: `results/VariantCalling/[TUMOR_vs_NORMAL]/ASCAT`**
 
 - `[TUMORSAMPLE].BAF` and `[NORMALSAMPLE].BAF`
   - file with beta allele frequencies
 - `[TUMORSAMPLE].LogR` and `[NORMALSAMPLE].LogR`
   - file with total copy number on a logarithmic scale
 
 #### ASCAT
 
-[ASCAT](https://github.com/Crick-CancerGenomics/ascat) is a method to derive copy number profiles of tumor cells, accounting for normal cell admixture and tumor aneuploidy.
-ASCAT infers tumor purity and ploidy and calculates whole-genome allele-specific copy number profiles.
+[ASCAT](https://github.com/Crick-CancerGenomics/ascat) is software for performing allele-specific copy number analysis of tumor samples and for estimating tumor ploidy and purity (normal contamination).
+It infers tumor purity and ploidy and calculates whole-genome allele-specific copy number profiles.
+`ASCAT` is written in `R` and available here: [github.com/Crick-CancerGenomics/ascat](https://github.com/Crick-CancerGenomics/ascat).
+The `ASCAT` process gives several images as output, described in detail in this [book chapter](http://www.ncbi.nlm.nih.gov/pubmed/22130873).
 
-For further reading and documentation see [the Sarek documentation about ASCAT](https://github.com/nf-core/sarek/blob/master/docs/ascat.md) or the [ASCAT manual](https://www.crick.ac.uk/research/labs/peter-van-loo/software).
+For a Tumor/Normal pair:
 
-For a Tumor/Normal pair only:
 **Output directory: `results/VariantCalling/[TUMOR_vs_NORMAL]/ASCAT`**
 
 - `[TUMORSAMPLE].aberrationreliability.png`
@@ -412,15 +463,27 @@ For a Tumor/Normal pair only:
 - `[TUMORSAMPLE].purityploidy.txt`
   - file with information about purity ploidy
 
+The text file `[TUMORSAMPLE].cnvs.txt` contains predictions about the copy number state for all the segments.
+The output is a tab-delimited text file with the following columns:
+
+- *chr*: chromosome number
+- *startpos*: start position of the segment
+- *endpos*: end position of the segment
+- *nMajor*: number of copies of one of the alleles (for example the chromosome inherited from the father)
+- *nMinor*: number of copies of the other allele (for example the chromosome inherited from the mother)
+
+The file `[TUMORSAMPLE].cnvs.txt` contains all segments predicted by ASCAT, both those with normal copy number (nMinor = 1 and nMajor = 1) and those corresponding to copy number aberrations.
+
+For further reading and documentation see the [ASCAT manual](https://www.crick.ac.uk/research/labs/peter-van-loo/software).
+
 #### Control-FREEC
 
-[Control-FREEC](https://github.com/BoevaLab/FREEC) is a tool for detection of copy-number changes and allelic imbalances (including LOH) using deep-sequencing data.
-Control-FREEC automatically computes, normalizes, segments copy number and beta allele frequency profiles, then calls copy number alterations and LOH.
-And also detects subclonal gains and losses and evaluate the likeliest average ploidy of the sample.
+[Control-FREEC](https://github.com/BoevaLab/FREEC) is a tool for detection of copy-number changes and allelic imbalances (including loss of heterozygosity (LOH)) using deep-sequencing data.
+`Control-FREEC` automatically computes, normalizes, segments copy number and beta allele frequency profiles, then calls copy number alterations and LOH.
+It also detects subclonal gains and losses and evaluates the most likely average ploidy of the sample.
 
-For further reading and documentation see the [Control-FREEC manual](http://boevalab.com/FREEC/tutorial.html).
+For a Tumor/Normal pair:
 
-For a Tumor/Normal pair only:
 **Output directory: `results/VariantCalling/[TUMOR_vs_NORMAL]/ControlFREEC`**
 
 - `[TUMORSAMPLE]_vs_[NORMALSAMPLE].config.txt`
+An altered distribution of microsatellite length is associated with a missed replication slippage which would be corrected under normal MMR conditions.
 
 #### MSIsensor
 
-[MSIsensor](https://github.com/ding-lab/msisensor) is a tool to detect the MSI
-status of a tumor scanning the length of the microsatellite regions. An altered
-distribution of microsatellite length is associated to a missed replication
-slippage which would be corrected under normal mismatch repair (MMR) conditions. It requires
-a normal sample for each tumour to differentiate the somatic and germline
-cases.
+[MSIsensor](https://github.com/ding-lab/msisensor) is a tool to detect the MSI status of a tumor by scanning the length of the microsatellite regions.
+It requires a normal sample for each tumour to differentiate the somatic and germline cases.
 
-For further reading see the [MSIsensor paper](https://www.ncbi.nlm.nih.gov/pubmed/24371154).
+For a Tumor/Normal pair:
 
-For a Tumor/Normal pair only:
 **Output directory: `results/VariantCalling/[TUMORSAMPLE]_vs_[NORMALSAMPLE]/MSIsensor`**
 
-- `[TUMORSAMPLE]_vs_[NORMALSAMPLE]`_msisensor
+- `[TUMORSAMPLE]_vs_[NORMALSAMPLE]_msisensor`
   - MSI score output, contains information about the number of somatic sites.
-- `[TUMORSAMPLE]_vs_[NORMALSAMPLE]`_msisensor_dis
+- `[TUMORSAMPLE]_vs_[NORMALSAMPLE]_msisensor_dis`
   - The normal and tumor length distribution for each microsatellite position.
-- `[TUMORSAMPLE]_vs_[NORMALSAMPLE]`_msisensor_germline
-  - somatic sites detected
-- `[TUMORSAMPLE]_vs_[NORMALSAMPLE]`_msisensor_somatic
-  - germ line sites detected
+- `[TUMORSAMPLE]_vs_[NORMALSAMPLE]_msisensor_germline`
  - Germline sites detected.
+- `[TUMORSAMPLE]_vs_[NORMALSAMPLE]_msisensor_somatic`
+  - Somatic sites detected.
+
+For further reading see the [MSIsensor paper](https://www.ncbi.nlm.nih.gov/pubmed/24371154).
 
 ## Variant annotation
 
-This directory contains results from the final annotation steps: two software are used for annotation, [snpEff](http://snpeff.sourceforge.net/) and [VEP](https://www.ensembl.org/info/docs/tools/vep/index.html).
-Only a subset of the VCF files are annotated, and only variants that have a PASS filter.
-FreeBayes results are not annotated in the moment yet as we are lacking a decent somatic filter.
-For HaplotypeCaller the germline variations are annotated for both the tumor and the normal sample.
+This directory contains results from the final annotation steps: two tools are used for annotation, [snpEff](http://snpeff.sourceforge.net/) and [VEP](https://www.ensembl.org/info/docs/tools/vep/index.html).
+Only a subset of the `VCF` files are annotated, and only variants that have a `PASS` filter.
+Currently, `FreeBayes` results are not annotated as we are lacking a decent somatic filter.
 
 ### snpEff
 
 [snpeff](http://snpeff.sourceforge.net/) is a genetic variant annotation and effect prediction toolbox.
 It annotates and predicts the effects of variants on genes (such as amino acid changes) using multiple databases for annotations.
-The generated VCF header contains the software version and the used command line.
-
-For further reading and documentation see the [snpEff manual](http://snpeff.sourceforge.net/SnpEff_manual.html#outputSummary)
+The generated `VCF` header contains the software version and the used command line.
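+
+As a quick sanity check, the recorded version and command line can be read directly from the header of the compressed `VCF`, for example (a sketch; the `##SnpEff*` header keys are assumptions based on standard snpEff output, and the file name is illustrative):
+
+```bash
+# Print the snpEff version and command line recorded in the annotated VCF header
+zcat VariantCaller_Sample_snpEff.ann.vcf.gz | grep '^##SnpEff'
+```
+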
 For all samples:
+
 **Output directory: `results/Annotation/[SAMPLE]/snpEff`**
 
 - `VariantCaller_Sample_snpEff.ann.vcf.gz` and `VariantCaller_Sample_snpEff.ann.vcf.gz.tbi`
-  - VCF with Tabix index
+  - `VCF` with Tabix index
 
-### VEP
+For further reading and documentation see the [snpEff manual](http://snpeff.sourceforge.net/SnpEff_manual.html#outputSummary)
 
-[VEP (Variant Effect Predictor)](https://www.ensembl.org/info/docs/tools/vep/index.html), based on Ensembl, is a tools to determine the effects of all sorts of variants, including SNPs, indels, structural variants, CNVs.
-The generated VCF header contains the software version, also the version numbers for additional databases like Clinvar or dbSNP used in the "VEP" line.
-The format of the [consequence annotations](https://www.ensembl.org/info/genome/variation/prediction/predicted_data.html) is also in the VCF header describing the INFO field.
-In the moment it contains:
-
-- Consequence: impact of the variation, if there is any
-- Codons: the codon change, i.e. cGt/cAt
-- Amino_acids: change in amino acids, i.e. R/H if there is any
-- Gene: ENSEMBL gene name
-- SYMBOL: gene symbol
-- Feature: actual transcript name
-- EXON: affected exon
-- PolyPhen: prediction based on [PolyPhen](http://genetics.bwh.harvard.edu/pph2/)
-- SIFT: prediction by [SIFT](http://sift.bii.a-star.edu.sg/)
-- Protein_position: Relative position of amino acid in protein
-- BIOTYPE: Biotype of transcript or regulatory feature
+### VEP
 
-For further reading and documentation see the [VEP manual](https://www.ensembl.org/info/docs/tools/vep/index.html)
+[VEP (Variant Effect Predictor)](https://www.ensembl.org/info/docs/tools/vep/index.html), based on `Ensembl`, is a tool to determine the effects of all sorts of variants, including SNPs, indels, structural variants and CNVs.
+The generated `VCF` header contains the software version, as well as the version numbers for additional databases like `ClinVar` or `dbSNP` used in the `VEP` line.
+The format of the [consequence annotations](https://www.ensembl.org/info/genome/variation/prediction/predicted_data.html) is also in the `VCF` header describing the `INFO` field.
+Currently, it contains:
+
+- *Consequence*: impact of the variation, if there is any
+- *Codons*: the codon change, i.e. cGt/cAt
+- *Amino_acids*: change in amino acids, i.e. R/H if there is any
+- *Gene*: ENSEMBL gene name
+- *SYMBOL*: gene symbol
+- *Feature*: actual transcript name
+- *EXON*: affected exon
+- *PolyPhen*: prediction based on [PolyPhen](http://genetics.bwh.harvard.edu/pph2/)
+- *SIFT*: prediction by [SIFT](http://sift.bii.a-star.edu.sg/)
+- *Protein_position*: Relative position of amino acid in protein
+- *BIOTYPE*: Biotype of transcript or regulatory feature
 
 For all samples:
+
 **Output directory: `results/Annotation/[SAMPLE]/VEP`**
 
 - `VariantCaller_Sample_VEP.ann.vcf.gz` and `VariantCaller_Sample_VEP.ann.vcf.gz.tbi`
-  - VCF with Tabix index
+  - `VCF` with Tabix index
+
+For further reading and documentation see the [VEP manual](https://www.ensembl.org/info/docs/tools/vep/index.html)
 
 ## QC and reporting
@@ -516,105 +577,112 @@ For all samples:
 
 #### FastQC
 
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your reads.
-It provides information about the quality score distribution across your reads, the per base sequence content (%T/A/G/C).
-You get information about adapter contamination and other overrepresented sequences.
-
-For further reading and documentation see the [FastQC help](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).
+[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads.
+It provides information about the quality score distribution across your reads, per base sequence content (`%A/T/G/C`), adapter contamination and overrepresented sequences.
 
 For all samples:
+
 **Output directory: `results/Reports/[SAMPLE]/fastqc`**
 
 - `sample_R1_XXX_fastqc.html` and `sample_R2_XXX_fastqc.html`
-  - FastQC report, containing quality metrics for each pair of the raw fastq files
+  - `FastQC` report containing quality metrics for your untrimmed raw `FASTQ` files
 - `sample_R1_XXX_fastqc.zip` and `sample_R2_XXX_fastqc.zip`
-  - zip file containing the FastQC reports, tab-delimited data files and plot images
+  - Zip archive containing the `FastQC` report, tab-delimited data file and plot images
+
+> **NB:** The `FastQC` plots displayed in the `MultiQC` report show _untrimmed_ reads.
+> They may contain adapter sequence and potentially regions with low quality.
+
+For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).
 
 #### bamQC
 
-[Qualimap bamqc](http://qualimap.bioinfo.cipf.es/) reports information for the evaluation of the quality of the provided alignment data. In short, the basic statistics of the alignment (number of reads, coverage, GC-content, etc.) are summarized and a number of useful graphs are produced.
+[Qualimap bamqc](http://qualimap.bioinfo.cipf.es/) reports information for the evaluation of the quality of the provided alignment data.
+In short, the basic statistics of the alignment (number of reads, coverage, GC-content, etc.) are summarized and a number of useful graphs are produced.
 
 Plot will show:
 
 - Stats by non-reference allele frequency, depth distribution, stats by quality and per-sample counts, singleton stats, etc.
 
 For all samples:
+
 **Output directory: `results/Reports/[SAMPLE]/bamQC`**
 
 - `VariantCaller_[SAMPLE].bcf.tools.stats.out`
-  - RAW statistics used by MultiQC
+  - Raw statistics used by `MultiQC`
 
-For more information about how to use Qualimap bamqc reports, see [Qualimap bamqc manual](http://qualimap.bioinfo.cipf.es/doc_html/analysis.html#id7)
+For further reading and documentation see the [Qualimap bamqc manual](http://qualimap.bioinfo.cipf.es/doc_html/analysis.html#id7)
 
-#### MarkDuplicates reports
+#### GATK MarkDuplicates reports
 
-[[GATK MarkDuplicatesSpark](https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_spark_transforms_markduplicates_MarkDuplicatesSpark.php), Spark implementation of [Picard MarkDuplicates](https://software.broadinstitute.org/gatk/documentation/tooldocs/current/picard_sam_markduplicates_MarkDuplicates.php) locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are defined as originating from a single fragment of DNA.
-
-If the pipeline is run with the option `--no_gatk_spark` then [GATK MarkDuplicates](https://software.broadinstitute.org/gatk/documentation/tooldocs/4.1.4.0/picard_sam_markduplicates_MarkDuplicates.php) is used instead.
-
-Collecting duplicate metrics slows down performance.
-To disable them use `--skip_qc MarkDuplicates`.
+More information is available in the [GATK MarkDuplicates section](#gatk-markduplicates).
 
 Duplicates can arise during sample preparation _e.g._ library construction using PCR.
 Duplicate reads can also result from a single amplification cluster, incorrectly detected as multiple clusters by the optical sensor of the sequencing instrument.
 These duplication artifacts are referred to as optical duplicates.
 
 For all samples:
+
 **Output directory: `results/Reports/[SAMPLE]/MarkDuplicates`**
 
 - `[SAMPLE].bam.metrics`
-  - RAW statistics used by MultiQC
+  - Raw statistics used by `MultiQC`
 
 For further reading and documentation see the [MarkDuplicates manual](https://software.broadinstitute.org/gatk/documentation/tooldocs/4.1.2.0/picard_sam_markduplicates_MarkDuplicates.php).
 
 #### samtools stats
 
-[samtools stats](https://www.htslib.org/doc/samtools.html) collects statistics from BAM files and outputs in a text format.
+[samtools stats](https://www.htslib.org/doc/samtools.html) collects statistics from `BAM` files and outputs them in a text format.
+
 Plots will show:
 
 - Alignment metrics.
 
 For all samples:
+
 **Output directory: `results/Reports/[SAMPLE]/SamToolsStats`**
 
 - `[SAMPLE].bam.samtools.stats.out`
-  - RAW statistics used by MultiQC
+  - Raw statistics used by `MultiQC`
 
-For further reading and documentation see the [samtools manual](https://www.htslib.org/doc/samtools.html#COMMANDS_AND_OPTIONS)
+For further reading and documentation see the [`samtools` manual](https://www.htslib.org/doc/samtools.html#COMMANDS_AND_OPTIONS)
 
 #### bcftools stats
 
-[bcftools](https://samtools.github.io/bcftools/) is a program for variant calling and manipulating files in the Variant Call Format.
+[bcftools](https://samtools.github.io/bcftools/) is a program for variant calling and manipulating `VCF` files.
+
 Plot will show:
 
 - Stats by non-reference allele frequency, depth distribution, stats by quality and per-sample counts, singleton stats, etc.
 
 For all samples:
+
 **Output directory: `results/Reports/[SAMPLE]/BCFToolsStats`**
 
 - `VariantCaller_[SAMPLE].bcf.tools.stats.out`
-  - RAW statistics used by MultiQC
+  - Raw statistics used by `MultiQC`
 
 For further reading and documentation see the [bcftools stats manual](https://samtools.github.io/bcftools/bcftools.html#stats)
 
 #### VCFtools
 
-[VCFtools](https://vcftools.github.io/) is a program package designed for working with VCF files.
+[VCFtools](https://vcftools.github.io/) is a program package designed for working with `VCF` files.
+
 Plots will show:
 
-- the summary counts of each type of transition to transversion ratio for each FILTER category.
+- the summary counts of each type of transition to transversion ratio for each `FILTER` category.
 - the transition to transversion ratio as a function of alternative allele count (using only bi-allelic SNPs).
 - the transition to transversion ratio as a function of SNP quality threshold (using only bi-allelic SNPs).
 
 For all samples:
+
 **Output directory: `results/Reports/[SAMPLE]/VCFTools`**
 
 - `VariantCaller_[SAMPLE].FILTER.summary`
-  - RAW statistics used by MultiQC
+  - Raw statistics used by `MultiQC`
 - `VariantCaller_[SAMPLE].TsTv.count`
-  - RAW statistics used by MultiQC
+  - Raw statistics used by `MultiQC`
 - `VariantCaller_[SAMPLE].TsTv.qual`
-  - RAW statistics used by MultiQC
+  - Raw statistics used by `MultiQC`
 
 For further reading and documentation see the [VCFtools manual](https://vcftools.github.io/man_latest.html#OUTPUT%20OPTIONS)
@@ -631,10 +699,11 @@ Plots will shows :
 
 - the quantity as function of the variant quality score.
 For all samples:
+
 **Output directory: `results/Reports/[SAMPLE]/snpEff`**
 
 - `VariantCaller_Sample_snpEff.csv`
-  - RAW statistics used by MultiQC
+  - Raw statistics used by `MultiQC`
 - `VariantCaller_Sample_snpEff.html`
   - Statistics to be visualised with a web browser
 - `VariantCaller_Sample_snpEff.genes.txt`
@@ -644,9 +713,10 @@ For further reading and documentation see the [snpEff manual](http://snpeff.sour
 
 #### VEP reports
 
-[VEP (Variant Effect Predictor)](https://www.ensembl.org/info/docs/tools/vep/index.html), based on Ensembl, is a tools to determine the effects of all sorts of variants, including SNPs, indels, structural variants, CNVs.
+[VEP (Variant Effect Predictor)](https://www.ensembl.org/info/docs/tools/vep/index.html), based on `Ensembl`, is a tool to determine the effects of all sorts of variants, including SNPs, indels, structural variants and CNVs.
 
 For all samples:
+
 **Output directory: `results/Reports/[SAMPLE]/VEP`**
 
 - `VariantCaller_Sample_VEP.summary.html`
@@ -658,17 +728,30 @@ For further reading and documentation see the [VEP manual](https://www.ensembl.o
 
 #### MultiQC
 
-[MultiQC](http://multiqc.info) is a visualisation tool that generates a single HTML report summarising all samples in your project.
-Most of the pipeline QC results are visualised in the report and further statistics are available in within the report data directory.
+[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarizing all samples in your project.
+Most of the pipeline QC results are visualized in the report and further statistics are available in the report data directory.
+
+The pipeline has special steps which also allow the software versions to be reported in the `MultiQC` output for future traceability.
+
+**Output files:**
+
+- `multiqc/`
+  - `multiqc_report.html`
+    - Standalone HTML file that can be viewed in your web browser
+  - `multiqc_data/`
+    - Directory containing parsed statistics from the different tools used in the pipeline
+  - `multiqc_plots/`
+    - Directory containing static images from the report in various formats
+
+For more information about how to use `MultiQC` reports, see [https://multiqc.info](https://multiqc.info).
 
-The pipeline has special steps which allow the software versions used to be reported in the MultiQC output for future traceability.
+## Pipeline information
 
-For the whole Sarek run:
-**Output directory: `results/Reports/MultiQC`**
+[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage.
 
-- `multiqc_report.html`
-  - MultiQC report - a standalone HTML file that can be viewed in your web browser
-- `multiqc_data/`
-  - Directory containing parsed statistics from the different tools used in the pipeline
+**Output files:**
 
-For further reading and documentation see the [MultiQC website](http://multiqc.info)
+- `pipeline_info/`
+  - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`.
+  - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.csv`.
  - Documentation for interpretation of results in HTML format: `results_description.html`.
diff --git a/docs/reference.md b/docs/reference.md deleted file mode 100644 index 741c61bd1b..0000000000 --- a/docs/reference.md +++ /dev/null @@ -1,62 +0,0 @@ -# Genomes and reference files - -## AWS iGenomes - -Sarek is using [AWS iGenomes](https://ewels.github.io/AWS-iGenomes/), which facilitate storing and sharing references. -Sarek currently uses `GRCh38` by default. -`GRCh37`, `GRCh38` and `GRCm38` are available with `--genome GRCh37`, `--genome GRCh38` or `--genome GRCm38` respectively with any profile using the `conf/igenomes.config` file, or you can specify it with `-c conf/igenomes.config`. -Use `--genome smallGRCh37` to map against a small reference genome based on GRCh37. -Settings in `igenomes.config` can be tailored to your needs. - -### Intervals - -To speed up some preprocessing and variant calling processes, the reference is chopped into smaller pieces. -The intervals are chromosomes cut at their centromeres (so each chromosome arm processed separately) also additional unassigned contigs. -We are ignoring the `hs37d5` contig that contains concatenated decoy sequences. -Parts of preprocessing and variant calling are done by these intervals, and the different resulting files are then merged. -This can parallelize processes, and push down wall clock time significantly. - -The calling intervals can be defined using a `.list` or a `.bed` file. -A `.list` file contains one interval per line in the format `chromosome:start-end` (1-based coordinates). - -When the intervals file is in BED format, the file must be a tab-separated text file with one interval per line. -There must be at least three columns: chromosome, start, and end. -In BED format, the coordinates are 0-based, so the interval `chrom:1-10` becomes `chrom010`. - -Additionally, the "score" column of the BED file can be used to provide an estimate of how many seconds it will take to call variants on that interval. -The fourth column remains unused. -Example (the fields would actually be tab-separated, this is not shown here): - -`chr1 10000 207666 NA 47.3` - -This indicates that variant calling on the interval chr1:10001-207666 takes approximately 47.3 seconds. - -The runtime estimate is used in two different ways. -First, when there are multiple consecutive intervals in the file that take little time to compute, they are processed as a single job, thus reducing the number of processes that needs to be spawned. -Second, the jobs with largest processing time are started first, which reduces wall-clock time. -If no runtime is given, a time of 1000 nucleotides per second is assumed. -Actual figures vary from 2 nucleotides/second to 30000 nucleotides/second. - -If no intervals files are specified, one will be automatically generated following: - -```bash -awk -v FS='\t' -v OFS='\t' '{ print \$1, \"0\", \$2 }' .fasta.fai > .bed -``` - -To disable this feature, please use [`--no_intervals`](usage.md#--no_intervals) - -### Working with whole exome (WES) or panel data - -The `--targetBED` parameter does _not_ imply that the workflow is running alignment or variant calling only for the supplied targets. -Instead, we are aligning for the whole genome, and selecting variants only at the very end by intersecting with the provided target file. -Adding every exon as an interval in case of WES can generate >200K processes or jobs, much more forks, and similar number of directories in the Nextflow work directory. 
-Furthermore, primers and/or baits are not 100% specific, (certainly not for MHC and KIR, etc.), quite likely there going to be reads mapping to multiple locations. -If you are certain that the target is unique for your genome (all the reads will certainly map to only one location), and aligning to the whole genome is an overkill, better to change the reference itself. - -### Working with other genomes - -> :warning: This is a new feature, in active development, so usage could change. - -Sarek can also do limited preprocessing from any genome, providing a `fasta` file as a reference genome, followed by limited variant calling using `mpileup`, `Manta` and `Strelka`. - -Limited support for `TAIR10`, `EB2`, `UMD3.1`, `bosTau8`, `WBcel235`, `ce10`, `CanFam3.1`, `canFam3`, `GRCz10`, `danRer10`, `BDGP6`, `dm6`, `EquCab2`, `equCab2`, `EB1`, `Galgal4`, `galGal4`, `Gm01`, `hg38`, `hg19`, `Mmul_1`, `mm10`, `IRGSP-1.0`, `CHIMP2.1.4`, `panTro4`, `Rnor_6.0`, `rn6`, `R64-1-1`, `sacCer3`, `EF2`, `Sbi1`, `Sscrofa10.2`, `susScr3`, `AGPv3`. diff --git a/docs/sentieon.md b/docs/sentieon.md deleted file mode 100644 index d675386df0..0000000000 --- a/docs/sentieon.md +++ /dev/null @@ -1,73 +0,0 @@ -# nf-core/sarek: Usage with sentieon - -- [Introduction](#introduction) -- [Sentieon Analysis Pipelines & Tools](#sentieon-analysis-pipelines--tools) - - [Alignment](#alignment) - - [Germline SNV/INDEL Variant Calling - DNAseq](#germline-snvindel-variant-calling---dnaseq) - - [Germline SNV/INDEL Variant Calling - DNAscope](#germline-snvindel-variant-calling---dnascope) - - [Somatic SNV/INDEL Variant Calling - TNscope](#somatic-snvindel-variant-calling---tnscope) - - [Structural Variant Calling](#structural-variant-calling) -- [usage](#usage) - - [--sentieon](#--sentieon) - - [--tools](#--tools) - -## Introduction - -[Sentieon](https://www.sentieon.com/) is a commercial solution to process genomics data with high computing efficiency, fast turnaround time, exceptional accuracy, and 100% consistency. - -If [Sentieon](https://www.sentieon.com/) is available, use this `--sentieon` params to enable with Sarek to use some Sentieon Analysis Pipelines & Tools. - -Please refer to the [nf-core/configs](https://github.com/nf-core/configs#adding-a-new-pipeline-specific-config) repository on how to make a pipeline-specific configuration file based on the [munin-sarek specific configuration file](https://github.com/nf-core/configs/blob/master/conf/pipeline/sarek/munin.config). - -Or ask us on the [nf-core Slack](http://nf-co.re/join/slack) on the following channels: [#sarek](https://nfcore.slack.com/channels/sarek) or [#configs](https://nfcore.slack.com/channels/configs). - -## Sentieon Analysis Pipelines & Tools - -The following Sentieon Analysis Pipelines & Tools are available within Sarek. - -### Alignment - -> Sentieon BWA matches BWA-MEM with > 2X speedup. - -This tool is enabled by default within Sarek if `--sentieon` is specified and if the pipeline is started with the `mapping` [step](usage.md#--step). - -### Germline SNV/INDEL Variant Calling - DNAseq - -> Precision FDA award-winning software. -> Matches GATK 3.3-4.1, and without down-sampling. -> Results up to 10x faster and 100% consistent every time. - -This tool is enabled within Sarek if `--sentieon` is specified and if `--tools DNAseq` is specified cf [--tools](#--tools). - -### Germline SNV/INDEL Variant Calling - DNAscope - -> Improved accuracy and genome characterization. -> Machine learning enhanced filtering producing top variant calling accuracy. 
- -This tool is enabled within Sarek if `--sentieon` is specified and if `--tools DNAscope` is specified cf [--tools](#--tools). - -### Somatic SNV/INDEL Variant Calling - TNscope - -> Winner of ICGC-TCGA DREAM challenge. -> Improved accuracy, machine learning enhanced filtering. -> Supports molecular barcodes and unique molecular identifiers. - -This tool is enabled within Sarek if `--sentieon` is specified and if `--tools TNscope` is specified cf [--tools](#--tools). - -### Structural Variant Calling - -> Germline and somatic SV calling, including translocations, inversions, duplications and large INDELs - -This tool is enabled within Sarek if `--sentieon` is specified and if `--tools DNAscope` is specified cf [--tools](#--tools). - -## usage - -### --sentieon - -Adds the following tools for the [`--tools`](#--tools) options: `DNAseq`, `DNAscope` and `TNscope`. - -### --tools - -For main usage of tools, follow the [usage/tools](usage.md#--tools) documentation. - -With `--sentieon` the following tools options are also available within Sarek: `DNAseq`, `DNAscope` and `TNscope`. diff --git a/docs/usage.md b/docs/usage.md index 90461d1bfe..6a75ae6ba2 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1,20 +1,52 @@ # nf-core/sarek: Usage -- [Introduction](#introduction) - [Running the pipeline](#running-the-pipeline) - [Updating the pipeline](#updating-the-pipeline) - [Reproducibility](#reproducibility) -- [Main arguments](#main-arguments) +- [Core Nextflow arguments](#core-nextflow-arguments) - [-profile](#-profile) - - [--input](#--input) + - [-resume](#-resume) + - [-c](#-c) + - [Custom resource requests](#custom-resource-requests) + - [Running in the background](#running-in-the-background) + - [Nextflow memory requirements](#nextflow-memory-requirements) +- [Pipeline specific arguments](#pipeline-specific-arguments) - [--step](#--step) + - [--input](#--input) + - [--input <FASTQ> --step mapping](#--input-fastq---step-mapping) + - [--input <uBAM> --step mapping](#--input-ubam---step-mapping) + - [--input <sample/> --step mapping](#--input-sample---step-mapping) + - [--input <TSV> --step prepare_recalibration](#--input-tsv---step-prepare_recalibration) + - [--input <TSV> --step prepare_recalibration --skip_markduplicates](#--input-tsv---step-prepare_recalibration---skip_markduplicates) + - [--input <TSV> --step recalibrate](#--input-tsv---step-recalibrate) + - [--input <TSV> --step recalibrate --skip_markduplicates](#--input-tsv---step-recalibrate---skip_markduplicates) + - [--input <TSV> --step variant_calling](#--input-tsv---step-variant_calling) + - [--input <TSV> --step Control-FREEC](#--input-tsv---step-control-freec) + - [--input <VCF> --step annotate](#--input-vcf---step-annotate) - [--help](#--help) - [--no_intervals](#--no_intervals) - [--nucleotides_per_second](#--nucleotides_per_second) - [--sentieon](#--sentieon) + - [Alignment](#alignment) + - [Germline SNV/INDEL Variant Calling - DNAseq](#germline-snvindel-variant-calling---dnaseq) + - [Germline SNV/INDEL Variant Calling - DNAscope](#germline-snvindel-variant-calling---dnascope) + - [Somatic SNV/INDEL Variant Calling - TNscope](#somatic-snvindel-variant-calling---tnscope) + - [Structural Variant Calling](#structural-variant-calling) - [--skip_qc](#--skip_qc) - [--target_bed](#--target_bed) - - [--tools](#--tools) + - [--tools for Variant Calling](#--tools-for-variant-calling) + - [Germline variant calling](#germline-variant-calling) + - [Somatic variant calling with tumor - normal 
pairs](#somatic-variant-calling-with-tumor---normal-pairs) + - [Somatic variant calling with tumor only samples](#somatic-variant-calling-with-tumor-only-samples) + - [--tools --sentieon](#--tools---sentieon) + - [--tools for Annotation](#--tools-for-annotation) + - [Annotation tools](#annotation-tools) + - [Using genome specific containers](#using-genome-specific-containers) + - [Download cache](#download-cache) + - [Using downloaded cache](#using-downloaded-cache) + - [Using VEP CADD plugin](#using-vep-cadd-plugin) + - [Downloading CADD files](#downloading-cadd-files) + - [Using VEP GeneSplicer plugin](#using-vep-genesplicer-plugin) - [Modify fastqs (trim/split)](#modify-fastqs-trimsplit) - [--trim_fastq](#--trim_fastq) - [--clip_r1](#--clip_r1) @@ -25,6 +57,7 @@ - [--save_trimmed](#--save_trimmed) - [--split_fastq](#--split_fastq) - [Preprocessing](#preprocessing) + - [--aligner](#--aligner) - [--markdup_java_options](#--markdup_java_options) - [--no_gatk_spark](#--no_gatk_spark) - [--save_bam_mapped](#--save_bam_mapped) @@ -39,6 +72,10 @@ - [--no_strelka_bp](#--no_strelka_bp) - [--pon](#--pon) - [--pon_index](#--pon_index) + - [--ignore_soft_clipped_bases](#--ignore_soft_clipped_bases) + - [--umi](#--umi) + - [--read_structure1](#--read_structure1) + - [--read_structure2](#--read_structure2) - [Annotation](#annotation) - [--annotate_tools](#--annotate_tools) - [--annotation_cache](#--annotation_cache) @@ -86,46 +123,24 @@ - [--plaintext_email](#--plaintext_email) - [--max_multiqc_email_size](#--max_multiqc_email_size) - [-name](#-name) - - [-resume](#-resume) - - [-c](#-c) - [--custom_config_version](#--custom_config_version) - [--custom_config_base](#--custom_config_base) - [Job resources](#job-resources) - [Automatic resubmission](#automatic-resubmission) - - [Custom resource requests](#custom-resource-requests) - [--max_memory](#--max_memory) - [--max_time](#--max_time) - [--max_cpus](#--max_cpus) - [--single_cpu_mem](#--single_cpu_mem) +- [Containers](#containers) + - [Building your owns](#building-your-owns) + - [Build with Conda](#build-with-conda) + - [Build with Docker](#build-with-docker) + - [Pull with Docker](#pull-with-docker) + - [Pull with Singularity](#pull-with-singularity) - [AWSBatch specific parameters](#awsbatch-specific-parameters) - [--awsqueue](#--awsqueue) - [--awsregion](#--awsregion) - [--awscli](#--awscli) -- [Deprecated params](#deprecated-params) - - [--annotateVCF](#--annotatevcf) - - [--noGVCF](#--nogvcf) - - [--noReports](#--noreports) - - [--noStrelkaBP](#--nostrelkabp) - - [--nucleotidesPerSecond](#--nucleotidespersecond) - - [--publishDirMode](#--publishdirmode) - - [--sample](#--sample) - - [--sampleDir](#--sampledir) - - [--skipQC](#--skipqc) - - [--targetBED](#--targetbed) - -## Introduction - -Nextflow handles job submissions on SLURM or other environments, and supervises running the jobs. -Thus the Nextflow process must run until the pipeline is finished. -We recommend that you put the process running in the background through `screen` / `tmux` or similar tool. -Alternatively you can run nextflow within a cluster job submitted your job scheduler. - -It is recommended to limit the Nextflow Java virtual machines memory. 
-We recommend adding the following line to your environment (typically in `~/.bashrc` or `~./bash_profile`): - -```bash -NXF_OPTS='-Xms1g -Xmx4g' -``` ## Running the pipeline @@ -150,11 +165,10 @@ results # Finished results (configurable, see below) The nf-core/sarek pipeline comes with more documentation about running the pipeline, found in the `docs/` directory: - [Output and how to interpret the results](output.md) -- [Extra Documentation on annotation](annotation.md) ### Updating the pipeline -When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. +When you run the above command, `Nextflow` automatically pulls the pipeline code from `GitHub` and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: @@ -168,36 +182,41 @@ It's a good idea to specify a pipeline version when running the pipeline on your This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. -First, go to the [nf-core/sarek releases page](https://github.com/nf-core/sarek/releases) and find the latest version number - numeric only (eg. `2.6`). -Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 2.6`. +First, go to the [nf-core/sarek releases page](https://github.com/nf-core/sarek/releases) and find the latest version number - numeric only (eg. `2.6.1`). +Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 2.6.1`. This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. -## Main arguments +## Core Nextflow arguments + +> **NB:** These options are part of `Nextflow` and use a _single_ hyphen (pipeline parameters use a double-hyphen). ### -profile -Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. +Use this parameter to choose a configuration profile. +Profiles can give configuration presets for different compute environments. -Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Conda) - see below. +Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (`Docker`, `Singularity`, `Conda`) - see below. -> We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. +> We highly recommend the use of `Docker` or `Singularity` containers for full pipeline reproducibility, however when this is not possible, `Conda` is also supported. -The pipeline also dynamically loads configurations from [https://github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, making multiple config profiles for various institutional clusters available at run time. For more information and to see if your system is available in these configs please see the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation). 
+The pipeline also dynamically loads configurations from [github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, making multiple config profiles for various institutional clusters available at run time.
+For more information and to see if your system is available in these configs please see the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation).
 
 Note that multiple profiles can be loaded, for example: `-profile test,docker` - the order of arguments is important!
 They are loaded in sequence, so later profiles can overwrite earlier profiles.
 
-If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended.
+If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`.
+This is _not_ recommended.
 
 - `docker`
   - A generic configuration profile to be used with [Docker](http://docker.com/)
-  - Pulls software from dockerhub: [`nfcore/sarek`](http://hub.docker.com/r/nfcore/sarek/)
+  - Pulls software from DockerHub: [`nfcore/sarek`](http://hub.docker.com/r/nfcore/sarek/)
- `singularity`
  - A generic configuration profile to be used with [Singularity](https://sylabs.io/docs/)
  - Pulls software from DockerHub: [`nfcore/sarek`](http://hub.docker.com/r/nfcore/sarek/)
- `conda`
-  - Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker or Singularity.
+  - Please only use `Conda` as a last resort i.e. when it's not possible to run the pipeline with `Docker` or `Singularity`.
  - A generic configuration profile to be used with [conda](https://conda.io/docs/)
  - Pulls most software from [Bioconda](https://bioconda.github.io/)
- `test`
@@ -205,103 +224,644 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof
  - Includes links to test data so needs no other parameters
- `test_annotation`
  - A profile with a complete configuration for automated testing
-  - Includes links to test data so needs no other parameters
+  - Input data is a `VCF` for testing annotation
- `test_no_gatk_spark`
  - A profile with a complete configuration for automated testing
-  - Includes links to test data so needs no other parameters
+  - Specifies `--no_gatk_spark`
- `test_split_fastq`
  - A profile with a complete configuration for automated testing
-  - Includes links to test data so needs no other parameters
+  - Specifies `--split_fastq 500`
- `test_targeted`
  - A profile with a complete configuration for automated testing
-  - Includes links to test data so needs no other parameters
+  - Includes a link to a target `BED` file and uses `Manta` and `Strelka` for Variant Calling
- `test_tool`
  - A profile with a complete configuration for automated testing
-  - Includes links to test data so needs no other parameters
+  - Tests Variant Calling directly with a specific `TSV` file and `--step variantcalling`
- `test_trimming`
  - A profile with a complete configuration for automated testing
-  - Includes links to test data so needs no other parameters
+  - Tests trimming options
+- `test_umi_qiaseq`
+  - A profile with a complete configuration for automated testing
+  - Tests a specific `UMI` structure
+- `test_umi_tso`
+  - A profile with a complete configuration for automated testing
+  - Tests a specific `UMI` structure
+
+### -resume
+
+Specify this when restarting a pipeline.
+`Nextflow` will use cached results from any pipeline steps where the inputs are the same, continuing from where it got to previously.
+
+You can also supply a run name or a session ID to resume a specific run: `-resume [run-name/session id]`.
+Use the `nextflow log` command to show previous run names and session IDs.
+
+### -c
+
+Specify the path to a specific config file (this is a core `Nextflow` command).
+See the [nf-core website documentation](https://nf-co.re/usage/configuration) for more information.
+
+#### Custom resource requests
+
+Each step in the pipeline has a default set of requirements for number of CPUs, memory and time.
+For most of the steps in the pipeline, if the job exits with an error code of `143` (exceeded requested resources), it will automatically resubmit with higher requests (2 x original, then 3 x original).
+If it still fails after three times then the pipeline is stopped.
+
+Whilst these default requirements will hopefully work for most people with most data, you may find that you want to customise the compute resources that the pipeline requests.
+You can do this by creating a custom config file.
+For example, to give the workflow process `VEP` 32GB of memory, you could use the following config:
+
+```nextflow
+process {
+  withName: VEP {
+    memory = 32.GB
+  }
+}
+```
+
+See the main [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for more information.
+
+If you are likely to be running `nf-core` pipelines regularly, it may be a good idea to request that your custom config file is uploaded to the `nf-core/configs` git repository.
+Before you do this, please test that the config file works with your pipeline of choice using the `-c` parameter (see the [-c section](#-c)).
+You can then create a pull request to the `nf-core/configs` repository with the addition of your config file, associated documentation file (see examples in [`nf-core/configs/docs`](https://github.com/nf-core/configs/tree/master/docs)), and amending [`nfcore_custom.config`](https://github.com/nf-core/configs/blob/master/nfcore_custom.config) to include your custom profile.
+
+If you have any questions or issues please send us a message on [Slack](https://nf-co.re/join/slack) on the [`#configs` channel](https://nfcore.slack.com/channels/configs).
+
+### Running in the background
+
+`Nextflow` handles job submissions and supervises the running jobs.
+The `Nextflow` process must run until the pipeline is finished.
+
+The `Nextflow` `-bg` flag launches Nextflow in the background, detached from your terminal so that the workflow does not stop if you log out of your session. The logs are saved to a file.
+
+Alternatively, you can use `screen` / `tmux` or a similar tool to create a detached session which you can log back into at a later time.
+Some HPC setups also allow you to run nextflow within a cluster job submitted to your job scheduler (from where it submits more jobs).
+
+#### Nextflow memory requirements
+
+In some cases, the `Nextflow` Java virtual machines can start to request a large amount of memory.
+We recommend adding the following line to your environment to limit this (typically in `~/.bashrc` or `~/.bash_profile`):
+
+```bash
+NXF_OPTS='-Xms1g -Xmx4g'
+```
+
+## Pipeline specific arguments
+
+### --step
+
+> **NB** Only one step must be specified
+
+Use this to specify the starting step.
+
+Default: `mapping`
+
+Available: `mapping`, `prepare_recalibration`, `recalibrate`, `variant_calling`, `annotate`, `Control-FREEC`
+
+> **NB** step can be specified with no concern for case, or the presence of `-` or `_`
 
 ### --input
 
-Use this to specify the location of your input TSV file.
-For example:
-TSV file should correspond to the correct step, see [`--step`](#--step) and [input](input.md) documentation for more information
+Use this to specify the location of your input `TSV` (Tab Separated Values) file.
+
+> **NB** Delimiter is the tab (`\t`) character, and no header is required
+
+There are different kinds of `TSV` files that can be used as input, depending on the input files available (`FASTQ`, `unmapped BAM`, `recalibrated BAM`...).
+The `TSV` file should correspond to the correct step; see [`--step`](#--step) for more information.
+
+`Sarek` auto-generates `TSV` files for all samples and for each individual sample, depending on the options specified.
+
+For all the possible `TSV` files described in the next sections, here is an explanation of what the columns refer to:
+
+- `subject` designates the subject, it should be the ID of the subject, and it must be unique for each subject, but one subject can have multiple samples (e.g. normal and tumor)
+- `sex` specifies the sex chromosomes of the subject (i.e. `XX`, `XY`...)
+- `status` is the status of the measured sample (`0` for Normal or `1` for Tumor)
+- `sample` designates the sample, it should be the ID of the sample (it is possible to have more than one tumor sample for each subject, i.e. a tumor and a relapse), it must be unique, but samples can have multiple lanes (which will later be merged)
+- `lane` is used when the sample is multiplexed on several lanes, it must be unique for each lane in the same sample (but does not need to be the original lane name), and must contain at least one character
+- `fastq1` is the path to the first `FASTQ` file of the pair
+- `fastq2` is the path to the second `FASTQ` file of the pair
+- `bam` is the path to the `BAM` file
+- `bai` is the path to the `BAM` index file
+- `recaltable` is the path to the recalibration table
+- `mpileup` is the path to the mpileup file
+
+It is recommended to add the absolute path of the files, but relative paths should also work.
+
+If necessary, a tumor sample can be associated to a normal sample as a pair, if specified with the same `subject` and a different `sample`.
+An additional tumor sample (such as a relapse, for example) can be added if specified with the same `subject` and a different `sample`.
+
+`Sarek` will output results in a different directory for each sample.
+If multiple samples are specified in the `TSV` file, `Sarek` will consider all files to be from different samples.
+Multiple `TSV` files can be specified if the path is enclosed in quotes.
+
+Output from Variant Calling and/or Annotation will be in a specific directory for each sample (or normal/tumor pair if applicable).
+
+#### --input <FASTQ> --step mapping
+
+The `TSV` file to start the mapping step with paired-end `FASTQ` files should contain the columns:
+
+`subject sex status sample lane fastq1 fastq2`
+
+In this example (`example_fastq.tsv`), there are 3 read groups.
+
+| | | | | | | |
+|-|-|-|-|-|-|-|
+|SUBJECT_ID|XX|0|SAMPLE_ID|1|/samples/normal1_1.fastq.gz|/samples/normal1_2.fastq.gz|
+|SUBJECT_ID|XX|0|SAMPLE_ID|2|/samples/normal2_1.fastq.gz|/samples/normal2_2.fastq.gz|
+|SUBJECT_ID|XX|0|SAMPLE_ID|3|/samples/normal3_1.fastq.gz|/samples/normal3_2.fastq.gz|
 
 ```bash
---input 
+--input example_fastq.tsv
 ```
 
-Multiple TSV files can be specified, using a [glob path](https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob), if enclosed in quotes.
+Or, for a normal/tumor pair:
 
-Use this to specify the location to a directory with fastq files for the `mapping` step of single germline samples only.
-For example:
+In this example (`example_pair_fastq.tsv`), there are 3 read groups for the normal sample and 2 for the tumor sample.
+
+| | | | | | | |
+|-|-|-|-|-|-|-|
+|SUBJECT_ID|XX|0|SAMPLE_ID1|1|/samples/normal1_1.fastq.gz|/samples/normal1_2.fastq.gz|
+|SUBJECT_ID|XX|0|SAMPLE_ID1|2|/samples/normal2_1.fastq.gz|/samples/normal2_2.fastq.gz|
+|SUBJECT_ID|XX|0|SAMPLE_ID1|3|/samples/normal3_1.fastq.gz|/samples/normal3_2.fastq.gz|
+|SUBJECT_ID|XX|1|SAMPLE_ID2|1|/samples/tumor1_1.fastq.gz|/samples/tumor1_2.fastq.gz|
+|SUBJECT_ID|XX|1|SAMPLE_ID2|2|/samples/tumor2_1.fastq.gz|/samples/tumor2_2.fastq.gz|
+
+```bash
+--input example_pair_fastq.tsv
+```
+
+#### --input <uBAM> --step mapping
+
+The `TSV` file for starting the mapping from `unmapped BAM` files should contain the columns:
+
+- `subject sex status sample lane bam`
+
+In this example (`example_ubam.tsv`), there are 3 read groups.
+
+| | | | | | |
+|-|-|-|-|-|-|
+|SUBJECT_ID|XX|0|SAMPLE_ID|1|/samples/normal_1.bam|
+|SUBJECT_ID|XX|0|SAMPLE_ID|2|/samples/normal_2.bam|
+|SUBJECT_ID|XX|0|SAMPLE_ID|3|/samples/normal_3.bam|
+
+```bash
+--input example_ubam.tsv
+```
+
+Or, for a normal/tumor pair:
+
+In this example (`example_pair_ubam.tsv`), there are 3 read groups for the normal sample and 2 for the tumor sample.
+
+| | | | | | |
+|-|-|-|-|-|-|
+|SUBJECT_ID|XX|0|SAMPLE_ID1|1|/samples/normal_1.bam|
+|SUBJECT_ID|XX|0|SAMPLE_ID1|2|/samples/normal_2.bam|
+|SUBJECT_ID|XX|0|SAMPLE_ID1|3|/samples/normal_3.bam|
+|SUBJECT_ID|XX|1|SAMPLE_ID2|1|/samples/tumor_1.bam|
+|SUBJECT_ID|XX|1|SAMPLE_ID2|2|/samples/tumor_2.bam|
+
+```bash
+--input example_pair_ubam.tsv
+```
+
+#### --input <sample/> --step mapping
+
+Use this to specify the location to a directory with `FASTQ` files for the `mapping` step of a single germline sample only.
 For example:
 
 ```bash
---input 
+--input </path/to/directory/>
 ```
+
+> **NB** All of the found `FASTQ` files are considered to belong to the same sample.
+
+The input folder, containing the `FASTQ` files for one subject (ID), should be organized into one sub-folder for every sample.
+The given directory is searched recursively for `FASTQ` files that are named `*_R1_*.fastq.gz`, and a matching pair with the same name except `_R2_` instead of `_R1_` is expected to exist alongside.
+All `FASTQ` files for that sample should be collected here.
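+
+For example, the read pairs that will be picked up can be previewed beforehand (a sketch, assuming GNU `find` is available and `<sample/>` is the directory passed to `--input`):
+
+```bash
+# List all R1 files; each is expected to have a matching R2 file alongside
+find <sample/> -type f -name '*_R1_*.fastq.gz'
+```
+
+The directory structure should look like the following: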
+
+```text
+ID
++--sample1
++------sample1_<lib>_<flowcell-index>_lane1_R1_1000.fastq.gz
++------sample1_<lib>_<flowcell-index>_lane1_R2_1000.fastq.gz
++------sample1_<lib>_<flowcell-index>_lane2_R1_1000.fastq.gz
++------sample1_<lib>_<flowcell-index>_lane2_R2_1000.fastq.gz
++--sample2
++------sample2_<lib>_<flowcell-index>_lane1_R1_1000.fastq.gz
++------sample2_<lib>_<flowcell-index>_lane1_R2_1000.fastq.gz
++--sample3
++------sample3_<lib>_<flowcell-index>_lane1_R1_1000.fastq.gz
++------sample3_<lib>_<flowcell-index>_lane1_R2_1000.fastq.gz
++------sample3_<lib>_<flowcell-index>_lane2_R1_1000.fastq.gz
++------sample3_<lib>_<flowcell-index>_lane2_R2_1000.fastq.gz
+```
 
-Multiple VCF files can be specified, using a [glob path](https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob), if enclosed in quotes.
+`FASTQ` filename structure:
 
-### --step
+- `<sample>_<lib>_<flowcell-index>_<lane>_R1_<suffix>.fastq.gz` and
-Use this to specify the starting step:
-Default `mapping`
-Available: `mapping`, `prepare_recalibration`, `recalibrate`, `variant_calling`, `annotate`, `Control-FREEC`
+- `<sample>_<lib>_<flowcell-index>_<lane>_R2_<suffix>.fastq.gz`
+
+Where:
+
+- `sample` = sample id
+- `lib` = identifier of library preparation
+- `flowcell-index` = identifier of flow cell for the sequencing run
+- `lane` = identifier of the lane of the sequencing run
+
+Read group information will be parsed from `FASTQ` file names according to this:
+
+- `RGID` = "sample_lib_flowcell_index_lane"
+- `RGPL` = "Illumina"
+- `PU` = sample
+- `RGLB` = lib
+
+Each `FASTQ` file pair gets its own read group (`@RG`) in the resulting `BAM` file in the following way:
+
+- The sample name (`SM`) is derived from the last component of the path given to `--input`.
+That is, you should make sure that the directory has a meaningful name! For example, with `--input=/my/fastqs/sample123`, the sample name will be `sample123`.
+- The read group id is set to *flowcell.samplename.lane*.
+The flowcell id and lane number are auto-detected from the name of the first read in the `FASTQ` file.
+
+#### --input <TSV> --step prepare_recalibration
+
+To start from the preparation of the recalibration step (`--step prepare_recalibration`), a `TSV` file needs to be given as input containing the paths to the `non-recalibrated BAM` files.
+The `Sarek`-generated `TSV` file is stored under `results/Preprocessing/TSV/duplicates_marked_no_table.tsv` and will automatically be used as an input when specifying the parameter `--step prepare_recalibration`.
+
+The `TSV` contains the following columns:
+
+- `subject sex status sample bam bai`
+
+| | | | | | |
+|-|-|-|-|-|-|
+|SUBJECT_ID|XX|0|SAMPLE_ID|/samples/normal.md.bam|/samples/normal.md.bai|
+
+Or, for a normal/tumor pair:
+
+| | | | | | |
+|-|-|-|-|-|-|
+|SUBJECT_ID|XX|0|SAMPLE_ID1|/samples/normal.md.bam|/samples/normal.md.bai|
+|SUBJECT_ID|XX|1|SAMPLE_ID2|/samples/tumor.md.bam|/samples/tumor.md.bai|
+
+#### --input <TSV> --step prepare_recalibration --skip_markduplicates
+
+The `Sarek`-generated `TSV` file is stored under `results/Preprocessing/TSV/mapped.tsv` and will automatically be used as an input when specifying the parameter `--step prepare_recalibration --skip_markduplicates`.
+The `TSV` file contains the same columns, but the content is slightly different:
+
+| | | | | | |
+|-|-|-|-|-|-|
+|SUBJECT_ID|XX|0|SAMPLE_ID|/samples/normal.bam|/samples/normal.bai|
+
+Or, for a normal/tumor pair:
+
+| | | | | | |
+|-|-|-|-|-|-|
+|SUBJECT_ID|XX|0|SAMPLE_ID1|/samples/normal.bam|/samples/normal.bai|
+|SUBJECT_ID|XX|1|SAMPLE_ID2|/samples/tumor.bam|/samples/tumor.bai|
+
+#### --input <TSV> --step recalibrate
+
+To start from the recalibrate step (`--step recalibrate`), a `TSV` file needs to be given as input containing the paths to the `non-recalibrated BAM` file and the associated recalibration table.
+The `Sarek`-generated `TSV` file is stored under `results/Preprocessing/TSV/duplicates_marked.tsv` and will automatically be used as an input when specifying the parameter `--step recalibrate`.
+
+The `TSV` contains the following columns:
+
+- `subject sex status sample bam bai recaltable`
+
+| | | | | | | |
+|-|-|-|-|-|-|-|
+|SUBJECT_ID|XX|0|SAMPLE_ID|/samples/normal.md.bam|/samples/normal.md.bai|/samples/normal.recal.table|
+
+Or, for a normal/tumor pair:
+
+| | | | | | | |
+|-|-|-|-|-|-|-|
+|SUBJECT_ID|XX|0|SAMPLE_ID1|/samples/normal.md.bam|/samples/normal.md.bai|/samples/normal.recal.table|
+|SUBJECT_ID|XX|1|SAMPLE_ID2|/samples/tumor.md.bam|/samples/tumor.md.bai|/samples/tumor.recal.table|
+
+#### --input <TSV> --step recalibrate --skip_markduplicates
+
+The `Sarek`-generated `TSV` file is stored under `results/Preprocessing/TSV/mapped_no_duplicates_marked.tsv` and will automatically be used as an input when specifying the parameter `--step recalibrate --skip_markduplicates`.
+The `TSV` file contains the same columns, but the content is slightly different:
+
+| | | | | | | |
+|-|-|-|-|-|-|-|
+|SUBJECT_ID|XX|0|SAMPLE_ID|/samples/normal.bam|/samples/normal.bai|/samples/normal.recal.table|
+
+Or, for a normal/tumor pair:
+
+| | | | | | | |
+|-|-|-|-|-|-|-|
+|SUBJECT_ID|XX|0|SAMPLE_ID1|/samples/normal.bam|/samples/normal.bai|/samples/normal.recal.table|
+|SUBJECT_ID|XX|1|SAMPLE_ID2|/samples/tumor.bam|/samples/tumor.bai|/samples/tumor.recal.table|
+
+#### --input <TSV> --step variant_calling
+
+To start from the variant calling step (`--step variant_calling`), a `TSV` file needs to be given as input containing the paths to the `recalibrated BAM` file and the associated index.
+The `Sarek`-generated `TSV` file is stored under `results/Preprocessing/TSV/recalibrated.tsv` and will automatically be used as an input when specifying the parameter `--step variant_calling`.
+
+The `TSV` file should contain the columns:
+
+- `subject sex status sample bam bai`
+
+Here is an example for one normal sample:
+
+| | | | | | |
+|-|-|-|-|-|-|
+|SUBJECT_ID|XX|0|SAMPLE_ID|/samples/normal.recal.bam|/samples/normal.recal.bai|
+
+Or, for a normal/tumor pair:
+
+| | | | | | |
+|-|-|-|-|-|-|
+|SUBJECT_ID|XX|0|SAMPLE_ID1|/samples/normal.recal.bam|/samples/normal.recal.bai|
+|SUBJECT_ID|XX|1|SAMPLE_ID2|/samples/tumor.recal.bam|/samples/tumor.recal.bai|
+
+#### --input <TSV> --step Control-FREEC
+
+To start from the Control-FREEC step (`--step Control-FREEC`), a `TSV` file needs to be given as input containing the paths to the mpileup files.
+The `Sarek`-generated `TSV` file is stored under `results/VariantCalling/TSV/control-freec_mpileup.tsv` and will automatically be used as an input when specifying the parameter `--step Control-FREEC`.
+
+The `TSV` file should contain the columns:
+
+- `subject sex status sample mpileup`
+
+Here is an example for one normal/tumor pair from one subject:
+
+| | | | | |
+|-|-|-|-|-|
+|SUBJECT_ID|XX|0|SAMPLE_ID1|/samples/normal.pileup|
+|SUBJECT_ID|XX|1|SAMPLE_ID2|/samples/tumor.pileup|
+
+#### --input <VCF> --step annotate
+
+For the annotation step (`--step annotate`) only, input files for `Sarek` can be specified using the path to a `VCF` file given to `--input`.
+As `Sarek` will use `bgzip` and `tabix` to compress and index the annotated `VCF` files, it expects the input `VCF` files to be sorted.
+Multiple `VCF` files can be specified, using a [glob path](https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob), if enclosed in quotes.
+For example:
+
+```bash
+--step annotate --input "results/VariantCalling/*/{HaplotypeCaller,Manta,Mutect2,Strelka,TIDDIT}/*.vcf.gz"
+```
 
 ### --help
 
-Will display the help message
+Will display the help message.
 
 ### --no_intervals
 
-Disable usage of intervals file, and disable automatic generation of intervals file when none are provided.
+Disable usage of the [`intervals`](#--intervals) file.
 
 ### --nucleotides_per_second
 
-Use this to estimate of how many seconds it will take to call variants on any interval, the default value is `1000` is it's not specified in the [`intervals`](#--intervals) file.
+Use this to provide an estimate of how many seconds it will take to call variants on any interval; the default value is `1000` when not specified in the [`intervals`](#--intervals) file.
 
 ### --sentieon
 
-If [Sentieon](https://www.sentieon.com/) is available, use this to enable it for preprocessing, and variant calling.
-Adds the following tools for the [`--tools`](#--tools) options: `DNAseq`, `DNAscope` and `TNscope`.
+[Sentieon](https://www.sentieon.com/) is a commercial solution to process genomics data with high computing efficiency, fast turnaround time, exceptional accuracy, and 100% consistency.
+
+If [Sentieon](https://www.sentieon.com/) is available, use the `--sentieon` parameter to enable `Sarek` to use some `Sentieon Analysis Pipelines & Tools`.
+Adds the following tools for the [`--tools`](#--tools---sentieon) options: `DNAseq`, `DNAscope` and `TNscope`.
+
+Please refer to the [nf-core/configs](https://github.com/nf-core/configs#adding-a-new-pipeline-specific-config) repository on how to make a pipeline-specific configuration file based on the [munin-sarek specific configuration file](https://github.com/nf-core/configs/blob/master/conf/pipeline/sarek/munin.config).
+
+Or ask us on the [nf-core Slack](http://nf-co.re/join/slack) on the following channels: [#sarek](https://nfcore.slack.com/channels/sarek) or [#configs](https://nfcore.slack.com/channels/configs).
+
+The following `Sentieon Analysis Pipelines & Tools` are available within `Sarek`:
+
+#### Alignment
+
+> Sentieon BWA matches BWA-MEM with > 2X speedup.
+
+This tool is enabled by default within `Sarek` if `--sentieon` is specified and if the pipeline is started with the `mapping` [step](usage.md#--step).
+
+#### Germline SNV/INDEL Variant Calling - DNAseq
+
+> Precision FDA award-winning software.
+> Matches GATK 3.3-4.1, and without down-sampling.
+> Results up to 10x faster and 100% consistent every time.
+
+This tool is enabled within `Sarek` if `--sentieon` is specified and if `--tools DNAseq` is specified (see [--tools --sentieon](#--tools---sentieon)).
+
+#### Germline SNV/INDEL Variant Calling - DNAscope
 
-More information in the [sentieon](sentieon.md) documentation.
+> Improved accuracy and genome characterization.
+> Machine learning enhanced filtering producing top variant calling accuracy.
+
+This tool is enabled within `Sarek` if `--sentieon` is specified and if `--tools DNAscope` is specified (see [--tools --sentieon](#--tools--sentieon)).
+
+#### Somatic SNV/INDEL Variant Calling - TNscope
+
+> Winner of ICGC-TCGA DREAM challenge.
+> Improved accuracy, machine learning enhanced filtering.
+> Supports molecular barcodes and unique molecular identifiers.
+
+This tool is enabled within `Sarek` if `--sentieon` is specified and if `--tools TNscope` is specified (see [--tools --sentieon](#--tools--sentieon)).
+
+#### Structural Variant Calling
+
+> Germline and somatic SV calling, including translocations, inversions, duplications and large INDELs
+
+This tool is enabled within `Sarek` if `--sentieon` is specified and if `--tools DNAscope` is specified (see [--tools --sentieon](#--tools--sentieon)).
 
 ### --skip_qc
 
 Use this to disable specific QC and Reporting tools.
 Multiple tools can be specified, separated by commas.
-Available: `all`, `bamQC`, `BaseRecalibrator`, `BCFtools`, `Documentation`, `FastQC`, `MultiQC`, `samtools`, `vcftools`, `versions`
+
+Available: `all`, `bamQC`, `BaseRecalibrator`, `BCFtools`, `Documentation`, `FastQC`, `MarkDuplicates`, `MultiQC`, `samtools`, `vcftools`, `versions`
+
 Default: `None`
 
+> **NB** `--skip_qc MarkDuplicates` does not skip `MarkDuplicates`, but prevents the collection of duplicate metrics, which slows down performance
+
 ### --target_bed
 
-Use this to specify the target BED file for targeted or whole exome sequencing.
+Use this to specify the target `BED` file for targeted or whole exome sequencing.
+
+The `--target_bed` parameter does _not_ imply that the workflow is running alignment or variant calling only for the supplied targets.
+Instead, we are aligning for the whole genome, and selecting variants only at the very end by intersecting with the provided target file.
+Adding every exon as an interval in case of `WES` can generate >200K processes or jobs, many more forks, and a similar number of directories in the Nextflow work directory.
+Furthermore, primers and/or baits are not 100% specific (certainly not for MHC and KIR, etc.), so quite likely there are going to be reads mapping to multiple locations.
+If you are certain that the target is unique for your genome (all the reads will certainly map to only one location), and aligning to the whole genome is an overkill, it is better to change the reference itself.
 
-### --tools
+The recommended flow for targeted sequencing data is to use the workflow as it is, but also provide a `BED` file containing targets for all steps using the `--target_bed` option.
+The workflow will pick up these intervals, and activate the `--exome` flag in any tools that allow it to process deeper coverage.
+It is advised to pad the variant calling regions (exons or target) to some extent before submitting to the workflow.
+To add the target `BED` file, configure the command line like:
+
+```bash
+--target_bed 
+```
+
+### --tools for Variant Calling
 
 Use this parameter to specify the variant calling and annotation tools to be used.
 Multiple tools can be specified, separated by commas.
 For example:
 
 ```bash
---tools 'Strelka,mutect2,SnpEff'
+--tools Strelka,mutect2,SnpEff
 ```
 
-Available variant callers: `ASCAT`, `ControlFREEC`, `FreeBayes`, `HaplotypeCaller`, `Manta`, `mpileup`, `MSIsensor`, `Mutect2`, `Strelka`, `TIDDIT`.
+Available variant callers: `ASCAT`, `Control-FREEC`, `FreeBayes`, `HaplotypeCaller`, `Manta`, `mpileup`, `MSIsensor`, `Mutect2`, `Strelka`, `TIDDIT`.
+
+> **NB** Tools can be specified with no concern for case
+
+For more information on the individual variant callers, and where to find the variant calling results, check the [output](output.md) documentation.
 
 > **WARNING** Not all variant callers are available for both germline and somatic variant calling.
-For more details please check the [variant calling](variant_calling.md) extra documentation.
 
-Available annotation tools: `VEP`, `SnpEff`, `merge`. For more details, please check the [annotation](annotation.md) extra documentation.
+For more information, please read the following documentation on [Germline variant calling](#germline-variant-calling), [Somatic variant calling with tumor - normal pairs](#somatic-variant-calling-with-tumor---normal-pairs) and [Somatic variant calling with tumor only samples](#somatic-variant-calling-with-tumor-only-samples).
+
+#### Germline variant calling
+
+Using `Sarek`, germline variant calling will always be performed if a variant calling tool with a germline mode is selected.
+Germline variant calling can currently only be performed with the following variant callers:
+
+- *FreeBayes*
+- *HaplotypeCaller*
+- *Manta*
+- *mpileup*
+- *Sentieon*
+- *Strelka*
+- *TIDDIT*
+
+For more information on the individual variant callers, and where to find the variant calling results, check the [output](output.md) documentation.
+
+#### Somatic variant calling with tumor - normal pairs
+
+Using `Sarek`, somatic variant calling will be performed if your input `TSV` file contains tumor/normal pairs (see the [input](#--input) documentation for more information).
+Different samples belonging to the same patient, where at least one is marked as normal (`0` in the `Status` column) and at least one is marked as tumor (`1` in the `Status` column), are treated as tumor/normal pairs.
+
+If tumor-normal pairs are provided, both germline variant calling and somatic variant calling will be performed, provided that the selected variant caller allows for it.
+If the selected variant caller allows only for somatic variant calling, then only somatic variant calling results will be generated.
+
+Here is a list of the variant calling tools that support somatic variant calling:
+
+- *ASCAT*
+- *Control-FREEC*
+- *FreeBayes*
+- *Manta*
+- *MSIsensor*
+- *Mutect2*
+- *Sentieon*
+- *Strelka*
+
+For more information on the individual variant callers, and where to find the variant calling results, check the [output](output.md) documentation.
+
+#### Somatic variant calling with tumor only samples
+
+Somatic variant calling with only tumor samples (no matching normal sample) is not recommended by the `GATK best practices`.
+It is only supported for a limited number of variant callers.
+
+Here is a list of the variant calling tools that support tumor-only somatic variant calling:
+
+- *Manta*
+- *mpileup*
+- *Mutect2*
+- *TIDDIT*
+
+### --tools --sentieon
+
+> **WARNING** Only with `--sentieon`
+
+If [Sentieon](https://www.sentieon.com/) is available, use the `--sentieon` parameter to enable `Sarek` to use some of the `Sentieon Analysis Pipelines & Tools`.
+Adds the following tools for the [`--tools`](#--tools) options: `DNAseq`, `DNAscope` and `TNscope`.
+
+### --tools for Annotation
+
+Available annotation tools: `VEP`, `SnpEff`, `merge`.
+For more details, please check the [annotation](#annotation-tools) documentation.
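+
+For example, a hypothetical command annotating existing calls with `snpEff` followed by `VEP` (the input glob is freely chosen for illustration):
+
+```bash
+nextflow run nf-core/sarek --step annotate --tools merge --input "results/VariantCalling/*/HaplotypeCaller/*.vcf.gz"
+```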
+
+#### Annotation tools
+
+With `Sarek`, annotation is done using `snpEff`, `VEP`, or even both consecutively:
+
+- `--tools snpEff`
+  - To annotate using `snpEff`
+- `--tools VEP`
+  - To annotate using `VEP`
+- `--tools snpEff,VEP`
+  - To annotate using `snpEff` and `VEP`
+- `--tools merge`
+  - To annotate using `snpEff` followed by `VEP`
+
+`VCF` files produced by `Sarek` will be annotated if `snpEff` or `VEP` are specified with the `--tools` parameter.
+As `Sarek` will use `bgzip` and `tabix` to compress and index the annotated `VCF` files, it expects `VCF` files to be sorted.
+
+In these examples, all command lines will be launched starting with `--step annotate`.
+The pipeline can of course be started directly from any other step instead.
+
+#### Using genome specific containers
+
+`Sarek` provides pre-built containers with `snpEff` and `VEP` files for Human (`GRCh37`, `GRCh38`), Mouse (`GRCm38`), Dog (`CanFam3.1`) and Roundworm (`WBcel235`).
+
+Default settings will run using these containers.
+The main `Sarek` container also has `snpEff` and `VEP` installed, but without the cache files, which can be downloaded separately.
+See the [containers documentation](#containers) for more information.
+
+#### Download cache
+
+A `Nextflow` helper script has been designed to help download `snpEff` and `VEP` caches.
+Such files are meant to be shared between multiple users, so this script is mainly meant for people administering servers or clusters, and for advanced users.
+
+```bash
+nextflow run download_cache.nf --snpeff_cache --snpeff_db --genome
+nextflow run download_cache.nf --vep_cache --species --vep_cache_version --genome
+```
+
+#### Using downloaded cache
+
+Both `snpEff` and `VEP` support the usage of a cache.
+If a cache is available on the machine where `Sarek` is run, it is possible to run annotation using that cache.
+You need to specify the cache directory using `--snpeff_cache` and `--vep_cache` on the command line or within configuration files.
+The cache will only be used when `--annotation_cache` and cache directories are specified (either on the command line or in a configuration file).
+
+Example:
+
+```bash
+nextflow run nf-core/sarek --tools snpEff --step annotate --sample --snpeff_cache --annotation_cache
+nextflow run nf-core/sarek --tools VEP --step annotate --sample --vep_cache --annotation_cache
+```
+
+#### Using VEP CADD plugin
+
+To enable the use of the `VEP` `CADD` plugin:
+
+- Download the `CADD` files
+- Specify them (either on the command line, like in the example, or in a configuration file)
+- Use the `--cadd_cache` flag
+
+Example:
+
+```bash
+nextflow run nf-core/sarek --step annotate --tools VEP --sample --cadd_cache \
+    --cadd_indels \
+    --cadd_indels_tbi \
+    --cadd_wg_snvs \
+    --cadd_wg_snvs_tbi
+```
+
+#### Downloading CADD files
+
+A helper script has been designed to help download `CADD` files.
+Such files are meant to be shared between multiple users, so this script is mainly meant for people administering servers or clusters, and for advanced users.
+ +```bash +nextflow run download_cache.nf --cadd_cache --cadd_version --genome +``` + +#### Using VEP GeneSplicer plugin + +To enable the use of the `VEP` `GeneSplicer` plugin: + +- use the `--genesplicer` flag + +Example: + +```bash +nextflow run nf-core/sarek --step annotate --tools VEP --sample --genesplicer +``` ## Modify fastqs (trim/split) @@ -311,36 +871,36 @@ Use this to perform adapter trimming with [Trim Galore](https://github.com/Felix ### --clip_r1 -Instructs Trim Galore to remove a number of bp from the 5' end of read 1 (or single-end reads). +Instructs `Trim Galore` to remove a number of bp from the 5' end of read 1 (or single-end reads). This may be useful if the qualities were very poor, or if there is some sort of unwanted bias at the 5' end. ### --clip_r2 -Instructs Trim Galore to remove a number of bp from the 5' end of read 2 (paired-end reads only). +Instructs `Trim Galore` to remove a number of bp from the 5' end of read 2 (paired-end reads only). This may be useful if the qualities were very poor, or if there is some sort of unwanted bias at the 5' end. ### --three_prime_clip_r1 -Instructs Trim Galore to remove a number of bp from the 3' end of read 1 (or single-end reads) AFTER adapter/quality trimming has been performed. +Instructs `Trim Galore` to remove a number of bp from the 3' end of read 1 (or single-end reads) AFTER adapter/quality trimming has been performed. This may remove some unwanted bias from the 3' end that is not directly related to adapter sequence or basecall quality. ### --three_prime_clip_r2 -Instructs Trim Galore to remove a number of bp from the 3' end of read 2 AFTER adapter/quality trimming has been performed. +Instructs `Trim Galore` to remove a number of bp from the 3' end of read 2 AFTER adapter/quality trimming has been performed. This may remove some unwanted bias from the 3' end that is not directly related to adapter sequence or basecall quality. ### --trim_nextseq This enables the option `--nextseq-trim=3'CUTOFF` within `Cutadapt`, which will set a quality cutoff (that is normally given with `-q` instead), but qualities of G bases are ignored. -This trimming is in common for the NextSeq and NovaSeq-platforms, where basecalls without any signal are called as high-quality G bases. +This trimming is common for the `NextSeq` and `NovaSeq` platforms, where basecalls without any signal are called as high-quality G bases. ### --save_trimmed -Option to keep trimmed FASTQs +Option to keep trimmed `FASTQs` ### --split_fastq -Use the Nextflow [`splitFastq`](https://www.nextflow.io/docs/latest/operator.html#splitfastq) operator to specify how many reads should be contained in the split fastq file. +Use the `Nextflow` [`splitFastq`](https://www.nextflow.io/docs/latest/operator.html#splitfastq) operator to specify how many reads should be contained in the split fastq file. For example: ```bash @@ -349,9 +909,33 @@ For example: ## Preprocessing +### --aligner + +To control which aligner is used for mapping the reads. + +Available: `bwa-mem` and `bwa-mem2` + +Default: `bwa-mem` + +Example: + +```bash +--aligner "bwa-mem" +``` + +> **WARNING** Current indices for `bwa` in AWS iGenomes are not compatible with `bwa-mem2`. +> Use `--bwa=false` to have `Sarek` build them automatically. + +Example: + +```bash +--aligner "bwa-mem2" --bwa=false +``` + ### --markdup_java_options -To control the java options necessary for the GATK `MarkDuplicates` process, you can set this parameter. 
+To control the java options necessary for the `GATK MarkDuplicates` process, you can set this parameter.
+
 Default: "-Xms4000m -Xmx7g"
 
 For example:
@@ -361,41 +945,45 @@ For example:
 
 ### --no_gatk_spark
 
-Use this to disable usage of GATK Spark implementation of their tools in local mode.
+Use this to disable usage of the `Spark` implementation of the `GATK` tools in local mode.
 
 ### --save_bam_mapped
 
-Will save mapped BAMs.
+Will save `mapped BAMs`.
 
 ### --skip_markduplicates
 
-Will skip MarkDuplicates. This params will also save the mapped BAMS, to enable restart from step `prepare_recalibration`
+Will skip `GATK MarkDuplicates`.
+This parameter will also save the `mapped BAMs`, to enable restart from the `prepare_recalibration` step.
 
 ## Variant Calling
 
 ### --ascat_ploidy
 
-Use this parameter to overwrite default behavior from ASCAT regarding ploidy.
-Requires that [`--ascat_purity`](#--ascat_purity) is set
+Use this parameter to overwrite default behavior from `ASCAT` regarding `ploidy`.
+Requires that [`--ascat_purity`](#--ascat_purity) is set.
 
 ### --ascat_purity
 
-Use this parameter to overwrite default behavior from ASCAT regarding purity.
-Requires that [`--ascat_ploidy`](#--ascat_ploidy) is set
+Use this parameter to overwrite default behavior from `ASCAT` regarding `purity`.
+Requires that [`--ascat_ploidy`](#--ascat_ploidy) is set.
 
 ### --cf_coeff
 
-Control-FREEC `coefficientOfVariation`
-Default: 0.015
+Use this parameter to overwrite default behavior from `Control-FREEC` regarding `coefficientOfVariation`.
+
+Default: `0.015`
 
 ### --cf_ploidy
 
-Control-FREEC `ploidy`
-Default: 2
+Use this parameter to overwrite default behavior from `Control-FREEC` regarding `ploidy`.
+
+Default: `2`
 
 ### --cf_window
 
-Control-FREEC `window size`
+Use this parameter to overwrite default behavior from `Control-FREEC` regarding `window size`.
+
 Default: Disabled
 
 ### --no_gvcf
@@ -408,39 +996,67 @@ Use this not to use `Manta` `candidateSmallIndels` for `Strelka` (not recommende
 
 ### --pon
 
-When a panel of normals [PON](https://gatkforums.broadinstitute.org/gatk/discussion/24057/how-to-call-somatic-mutations-using-gatk4-mutect2#latest) is defined, it will be use to filter somatic calls.
-Without PON, there will be no calls with PASS in the INFO field, only an _unfiltered_ VCF is written.
-It is recommended to make your own panel-of-normals, as it depends on sequencer and library preparation.
-For tests in iGenomes there is a dummy PON file in the Annotation/GermlineResource directory, but it _should not be used_ as a real panel-of-normals file.
-Provide your PON by:
+When a [panel of normals (PON)](https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON) is defined, it will be used to filter somatic calls.
+Without a `PON`, there will be no calls with `PASS` in the `INFO` field; only an _unfiltered_ `VCF` is written.
+It is recommended to make your own `PON`, as it depends on sequencer and library preparation.
+For tests in `iGenomes` there is a dummy `PON` file in the Annotation/GermlineResource directory, but it _should not be used_ as a real panel-of-normals file.
+Provide your `PON` with:
 
 ```bash
--pon 
--pon_index 
 --pon 
 ```
 
-PON file should be bgzipped.
+The `PON` file should be bgzipped.
 
 ### --pon_index
 
-Tabix index of the panel-of-normals bgzipped VCF file.
-If none provided, will be generated automatically from the panel-of-normals bgzipped VCF file.
+Tabix index of the `PON` bgzipped VCF file.
+If none provided, will be generated automatically from the `PON` bgzipped VCF file.
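+
+As a minimal sketch, such a file can be prepared outside the pipeline with `htslib`, assuming a sorted `pon.vcf` (hypothetical filename):
+
+```bash
+# Compress with bgzip, then build the tabix index expected by the pipeline
+bgzip pon.vcf
+tabix -p vcf pon.vcf.gz
+```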
+
+### --ignore_soft_clipped_bases
+
+Do not analyze soft clipped bases in the reads for `GATK Mutect2` with the `--dont-use-soft-clipped-bases` option.
+
+### --umi
+
+If provided, `Unique Molecular Identifiers (UMIs)` steps will be run to extract and annotate the reads with `UMIs` and create consensus reads.
+This part of the pipeline uses [fgbio](https://github.com/fulcrumgenomics/fgbio) to convert the `FASTQ` files into an `unmapped BAM`, where reads are tagged with the `UMIs` extracted from the `FASTQ` sequences.
+In order to allow the correct tagging, the `UMI` sequence must be contained in the read sequence itself, and not in the `FASTQ` filename.
+Following this step, the `unmapped BAM` is aligned and reads are then grouped based on mapping position and `UMI` tag.
+Finally, reads in the same groups are collapsed to create a consensus read.
+To create the consensus, we have chosen to use the *adjacency method* [ref](https://cgatoxford.wordpress.com/2015/08/14/unique-molecular-identifiers-the-problem-the-solution-and-the-proof/).
+In order for the correct tagging to be performed, a read structure needs to be specified as indicated below.
+
+### --read_structure1
+
+When processing `UMIs`, a read structure should always be provided for each of the `FASTQ` files, to allow the correct annotation of the `BAM` file.
+If the read does not contain any `UMI`, the structure will be +T (i.e. only template of any length).
+The read structure follows a format adopted by different tools, and described [here](https://github.com/fulcrumgenomics/fgbio/wiki/Read-Structures).
+
+### --read_structure2
+
+When processing `UMIs`, a read structure should always be provided for each of the `FASTQ` files, to allow the correct annotation of the `BAM` file.
+If the read does not contain any `UMI`, the structure will be +T (i.e. only template of any length).
+The read structure follows a format adopted by different tools, and described [here](https://github.com/fulcrumgenomics/fgbio/wiki/Read-Structures).
 
 ## Annotation
 
 ### --annotate_tools
 
-Specify from which tools Sarek should look for VCF files to annotate, only for step `Annotate`.
+Specify from which tools `Sarek` should look for `VCF` files to annotate, only for step `Annotate`.
+
 Available: `HaplotypeCaller`, `Manta`, `Mutect2`, `Strelka`, `TIDDIT`
+
 Default: `None`
 
 ### --annotation_cache
 
-Enable usage of annotation cache, and disable usage of already built containers within Sarek.
-For more information, follow the [annotation guidelines](annotation.md#using-downloaded-cache).
+Enable usage of annotation cache, and disable usage of already built containers within `Sarek`.
+For more information, follow the [downloaded cache guidelines](#using-downloaded-cache).
 
 ### --snpeff_cache
 
-To be used conjointly with [`--annotation_cache`](#--annotation_cache), specify the cache snpEff directory:
+To be used conjointly with [`--annotation_cache`](#--annotation_cache), specify the cache `snpEff` directory:
 
 ```bash
 --snpeff_cache 
@@ -448,7 +1064,7 @@ To be used conjointly with [`--annotation_cache`](#--annotation_cache), specify
 
 ### --vep_cache
 
-To be used conjointly with [`--annotation_cache`](#--annotation_cache), specify the cache VEP directory:
+To be used conjointly with [`--annotation_cache`](#--annotation_cache), specify the cache `VEP` directory:
 
 ```bash
 --vep_cache 
@@ -456,39 +1072,39 @@ To be used conjointly with [`--annotation_cache`](#--annotation_cache), specify
 
 ### --cadd_cache
 
-Enable CADD cache.
+Enable `CADD` cache.
### --cadd_indels -Path to CADD InDels file. +Path to `CADD InDels` file. ### --cadd_indels_tbi -Path to CADD InDels index. +Path to `CADD InDels` index. ### --cadd_wg_snvs -Path to CADD SNVs file. +Path to `CADD SNVs` file. ### --cadd_wg_snvs_tbi -Path to CADD SNVs index. +Path to `CADD SNVs` index. ### --genesplicer -Enable genesplicer within VEP. +Enable `genesplicer` within `VEP`. ## Reference genomes -The pipeline config files come bundled with paths to the Illumina iGenomes reference index files. -If running with docker or AWS, the configuration is set up to use the [AWS-iGenomes](https://ewels.github.io/AWS-iGenomes/) resource. +The pipeline config files come bundled with paths to the `Illumina iGenomes` reference index files. +The configuration is set up to use the [AWS-iGenomes](https://ewels.github.io/AWS-iGenomes/) resource. ### --genome (using iGenomes) -There are 2 different species supported by Sarek in the iGenomes references. +Sarek is using [AWS iGenomes](https://ewels.github.io/AWS-iGenomes/), which facilitate storing and sharing references. To run the pipeline, you must specify which to use with the `--genome` flag. -You can find the keys to specify the genomes in the [iGenomes config file](../conf/igenomes.config). +You can find the keys to specify the genomes in the [iGenomes config file](../conf/igenomes.config)). Genomes that are supported are: - Homo sapiens @@ -515,70 +1131,70 @@ Limited support for: - `--genome ce10` (UCSC) - Canis familiaris - - `--genome CanFam3.1` (Ensembl) - - `--genome canFam3` (UCSC) + - `--genome CanFam3.1` (Ensembl) + - `--genome canFam3` (UCSC) - Danio rerio - - `--genome GRCz10` (Ensembl) - - `--genome danRer10` (UCSC) + - `--genome GRCz10` (Ensembl) + - `--genome danRer10` (UCSC) - Drosophila melanogaster - - `--genome BDGP6` (Ensembl) - - `--genome dm6` (UCSC) + - `--genome BDGP6` (Ensembl) + - `--genome dm6` (UCSC) - Equus caballus - - `--genome EquCab2` (Ensembl) - - `--genome equCab2` (UCSC) + - `--genome EquCab2` (Ensembl) + - `--genome equCab2` (UCSC) - Escherichia coli K 12 DH10B - - `--genome EB1` (Ensembl) + - `--genome EB1` (Ensembl) - Gallus gallus - - `--genome Galgal4` (Ensembl) - - `--genome galgal4` (UCSC) + - `--genome Galgal4` (Ensembl) + - `--genome galgal4` (UCSC) - Glycine max - - `--genome Gm01` (Ensembl) + - `--genome Gm01` (Ensembl) - Homo sapiens - - `--genome hg19` (UCSC) - - `--genome hg38` (UCSC) + - `--genome hg19` (UCSC) + - `--genome hg38` (UCSC) - Macaca mulatta - - `--genome Mmul_1` (Ensembl) + - `--genome Mmul_1` (Ensembl) - Mus musculus - - `--genome mm10` (Ensembl) + - `--genome mm10` (Ensembl) - Oryza sativa japonica - - `--genome IRGSP-1.0` (Ensembl) + - `--genome IRGSP-1.0` (Ensembl) - Pan troglodytes - - `--genome CHIMP2.1.4` (Ensembl) - - `--genome panTro4` (UCSC) + - `--genome CHIMP2.1.4` (Ensembl) + - `--genome panTro4` (UCSC) - Rattus norvegicus - - `--genome Rnor_6.0` (Ensembl) - - `--genome rn6` (UCSC) + - `--genome Rnor_6.0` (Ensembl) + - `--genome rn6` (UCSC) - Saccharomyces cerevisiae - - `--genome R64-1-1` (Ensembl) - - `--genome sacCer3` (UCSC) + - `--genome R64-1-1` (Ensembl) + - `--genome sacCer3` (UCSC) - Schizosaccharomyces pombe - - `--genome EF2` (Ensembl) + - `--genome EF2` (Ensembl) - Sorghum bicolor - - `--genome Sbi1` (Ensembl) + - `--genome Sbi1` (Ensembl) - Sus scrofa - - `--genome Sscrofa10.2` (Ensembl) - - `--genome susScr3` (UCSC) + - `--genome Sscrofa10.2` (Ensembl) + - `--genome susScr3` (UCSC) - Zea mays - - `--genome AGPv3` (Ensembl) + - `--genome AGPv3` (Ensembl) 
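+
+For example, a hypothetical run on human data aligned to `GRCh38` (the input file name is freely chosen for illustration):
+
+```bash
+nextflow run nf-core/sarek --genome GRCh38 --input my_samples.tsv --tools HaplotypeCaller
+```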
-Note that you can use the same configuration setup to save sets of reference files for your own use, even if they are not part of the iGenomes resource. +Note that you can use the same configuration setup to save sets of reference files for your own use, even if they are not part of the `AWS iGenomes` resource. See the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for instructions on where to save such a file. The syntax for this reference configuration is as follows: @@ -615,6 +1231,7 @@ params { ### --igenomes_base Specify base path to AWS iGenomes + Default: `s3://ngi-igenomes/igenomes/` ### --igenomes_ignore @@ -624,21 +1241,13 @@ You may choose this option if you observe clashes between custom parameters and This option will load the `genomes.config` file instead. You can then specify the `--genome custom` and specify any reference file on the command line or within a config file. -```bash ---igenomes_ignore -``` - ### --genomes_base Specify base path to reference genome ### --save_reference -Enable saving reference indexes and other files built within Sarek. - -```bash ---save_reference -``` +Enable saving reference indexes and other files built within `Sarek`. ### --ac_loci @@ -694,7 +1303,7 @@ If you prefer, you can specify the full path to your reference genome when you r If you prefer, you can specify the full path to your reference genome when you run the pipeline: -> If none provided, will be generated automatically from the fasta reference. +> If none provided, will be generated automatically from dbsnp `VCF` file. ```bash --dbsnp_index @@ -731,8 +1340,8 @@ If you prefer, you can specify the full path to your reference genome when you r ### --germline_resource The [germline resource VCF file](https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_mutect_Mutect2.php#--germline-resource) (bgzipped and tabixed) needed by GATK4 Mutect2 is a collection of calls that are likely present in the sample, with allele frequencies. -The AF info field must be present. -You can find a smaller, stripped gnomAD VCF file (most of the annotation is removed and only calls signed by PASS are stored) in the iGenomes Annotation/GermlineResource folder. +The `AF` info field must be present. +You can find a smaller, stripped gnomAD `VCF` file (most of the annotation is removed and only calls signed by PASS are stored) in the `AWS iGenomes` `Annotation/GermlineResource` folder. If you prefer, you can specify the full path to your reference genome when you run the pipeline: ```bash @@ -744,7 +1353,7 @@ If you prefer, you can specify the full path to your reference genome when you r Tabix index of the germline resource specified at [`--germline_resource`](#--germline_resource). If you prefer, you can specify the full path to your reference genome when you run the pipeline: -> If none provided, will be generated automatically from the fasta reference. +> If none provided, will be generated automatically from the germline resource `VCF` file. ```bash --germline_resource_index @@ -752,8 +1361,33 @@ If you prefer, you can specify the full path to your reference genome when you r ### --intervals -Used to speed up Preprocessing and/or Variant Calling, for more information, read the [intervals section in the extra documentation on reference](reference.md#Intervals). +To speed up some preprocessing and variant calling processes, the reference is chopped into smaller pieces. 
+The intervals are the chromosomes cut at their centromeres (so each chromosome arm is processed separately), plus the additional unassigned contigs.
+We are ignoring the `hs37d5` contig that contains concatenated decoy sequences.
+Parts of preprocessing and variant calling are done by these intervals, and the different resulting files are then merged.
+This can parallelize processes, and push down wall clock time significantly.
+
+The calling intervals can be defined using a `.list` or a `.bed` file.
+A `.list` file contains one interval per line in the format `chromosome:start-end` (1-based coordinates).
+
+When the intervals file is in `BED` format, the file must be a tab-separated text file with one interval per line.
+There must be at least three columns: chromosome, start, and end.
+In `BED` format, the coordinates are 0-based, so the interval `chrom:1-10` becomes `chrom 0 10`.
+
+Additionally, the `score` column of the `BED` file can be used to provide an estimate of how many seconds it will take to call variants on that interval.
+The fourth column remains unused.
+
+| | | | | |
+|-|-|-|-|-|
+|chr1|10000|207666|NA|47.3|
+
+This indicates that variant calling on the interval chr1:10001-207666 takes approximately 47.3 seconds.
+
+The runtime estimate is used in two different ways.
+First, when there are multiple consecutive intervals in the file that take little time to compute, they are processed as a single job, thus reducing the number of processes that need to be spawned.
+Second, the jobs with the largest processing time are started first, which reduces wall-clock time.
+If no runtime is given, a rate of 1000 nucleotides per second is assumed.
+Actual figures vary from 2 nucleotides/second to 30000 nucleotides/second.
 
 If you prefer, you can specify the full path to your reference genome when you run the pipeline:
 
 > If none provided, will be generated automatically from the fasta reference.
 
 ```bash
 --intervals 
 ```
@@ -775,7 +1409,7 @@ If you prefer, you can specify the full path to your reference genome when you r
 
 If you prefer, you can specify the full path to your reference genome when you run the pipeline:
 
-> If none provided, will be generated automatically from the fasta reference.
+> If none provided, will be generated automatically from the known indels `VCF` file.
 
 ```bash
 --known_indels_index 
 ```
@@ -799,7 +1433,12 @@ If you prefer, you can specify the DB version when you run the pipeline:
 
 ### --species
 
-This specifies the species used for running VEP annotation. For human data, this needs to be set to `homo_sapiens`, for mouse data `mus_musculus` as the annotation needs to know where to look for appropriate annotation references. If you use iGenomes or a local resource with `genomes.conf`, this has already been set for you appropriately.
+This specifies the species used for running `VEP` annotation.
+If you use iGenomes or a local resource with `genomes.conf`, this has already been set for you appropriately.
+
+```bash
+--species 
+```
 
 ### --vep_cache_version
 
@@ -814,21 +1453,24 @@ If you prefer, you can specify the cache version when you run the pipeline:
 
 ### --outdir
 
 The output directory where the results will be saved.
+
 Default: `results/`
 
 ### --publish_dir_mode
 
 The file publishing method.
+
 Available: `symlink`, `rellink`, `link`, `copy`, `copyNoFollow`, `move`
+
 Default: `copy`
 
 ### --sequencing_center
 
-The sequencing center that will be used in the BAM CN field
+The sequencing center that will be used in the `BAM` `CN` field
 
 ### --multiqc_config
 
-Specify a path to a custom MultiQC configuration file.
+Specify a path to a custom `MultiQC` configuration file.
 
 ### --monochrome_logs
 
@@ -849,39 +1491,25 @@ Set to receive plain-text e-mails instead of HTML formatted.
 
 ### --max_multiqc_email_size
 
-Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB).
+Threshold size for `MultiQC` report to be attached in notification email.
+If the file generated by the pipeline exceeds the threshold, it will not be attached.
+
+Default: `25MB`.
 
 ### -name
 
 Name for the pipeline run.
-If not specified, Nextflow will automatically generate a random mnemonic.
-
-This is used in the MultiQC report (if not default) and in the summary HTML / e-mail (always).
+If not specified, `Nextflow` will automatically generate a random mnemonic.
 
-**NB:** Single hyphen (core Nextflow option)
-
-### -resume
-
-Specify this when restarting a pipeline.
-Nextflow will used cached results from any pipeline steps where the inputs are the same, continuing from where it got to previously.
-
-You can also supply a run name to resume a specific run: `-resume [run-name]`.
-Use the `nextflow log` command to show previous run names.
-
-**NB:** Single hyphen (core Nextflow option)
-
-### -c
-
-Specify the path to a specific config file (this is a core NextFlow command).
-
-**NB:** Single hyphen (core Nextflow option)
-
-Note - you can use this to override pipeline defaults.
+This is used in the `MultiQC` report (if not default) and in the summary HTML / e-mail (always).
+
+**NB:** Single hyphen (core `Nextflow` option)
 
 ### --custom_config_version
 
 Provide git commit id for custom Institutional configs hosted at `nf-core/configs`.
 This was implemented for reproducibility purposes.
+
 Default is set to `master`.
 
 ```bash
@@ -891,25 +1519,17 @@ Default is set to `master`.
 
 ### --custom_config_base
 
-If you're running offline, nextflow will not be able to fetch the institutional config files
+If you're running offline, `Nextflow` will not be able to fetch the institutional config files
 from the internet.
 If you don't need them, then this is not a problem.
-If you do need them, you should download the files from the repo and tell nextflow where to find them with the `custom_config_base` option.
+If you do need them, you should download the files from the repository and tell `Nextflow` where to find them with the `custom_config_base` option.
 For example:
 
 ```bash
-## Download and unzip the config files
-cd /path/to/my/configs
-wget https://github.com/nf-core/configs/archive/master.zip
-unzip master.zip
-
-## Run the pipeline
-cd /path/to/my/data
-nextflow run /path/to/pipeline/ --custom_config_base /path/to/my/configs/configs-master/
+## Download and unzip the config files
+cd /path/to/my/configs
+wget https://github.com/nf-core/configs/archive/master.zip
+unzip master.zip
+
+## Run the pipeline
+cd /path/to/my/data
+nextflow run /path/to/pipeline/ --custom_config_base /path/to/my/configs/configs-master/
 ```
 
-> Note that the nf-core/tools helper package has a `download` command to download all required pipeline
-> files + singularity containers + institutional configs in one go for you, to make this process easier.
+> Note that the nf-core/tools helper package has a `download` command to download all required pipeline files + singularity containers + institutional configs in one go for you, to make this process easier.
 
 ## Job resources
 
@@ -919,17 +1539,6 @@ Each step in the pipeline has a default set of requirements for number of CPUs,
 For most of the steps in the pipeline, if the job exits with an error code of `143` (exceeded requested resources) it will automatically resubmit with higher requests (2 x original, then 3 x original).
 If it still fails after three times then the pipeline is stopped.
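+
+For example, a hypothetical run capping the per-process defaults (values freely chosen; see the parameters documented below):
+
+```bash
+nextflow run nf-core/sarek --input my_samples.tsv --max_cpus 8 --max_memory '64.GB'
+```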
-### Custom resource requests - -Wherever process-specific requirements are set in the pipeline, the default value can be changed by creating a custom config file. -See the files hosted at [`nf-core/configs`](https://github.com/nf-core/configs/tree/master/conf) for examples. - -If you are likely to be running `nf-core` pipelines regularly it may be a good idea to request that your custom config file is uploaded to the `nf-core/configs` git repository. -Before you do this please can you test that the config file works with your pipeline of choice using the `-c` parameter (see definition below). -You can then create a pull request to the `nf-core/configs` repository with the addition of your config file, associated documentation file (see examples in [`nf-core/configs/docs`](https://github.com/nf-core/configs/tree/master/docs)), and amending [`nfcore_custom.config`](https://github.com/nf-core/configs/blob/master/nfcore_custom.config) to include your custom profile. - -If you have any questions or issues please send us a message on [Slack](https://nf-co.re/join/slack). - ### --max_memory Use to set a top-limit for the default memory requirement for each process. @@ -950,67 +1559,177 @@ Should be a string in the format integer-unit eg. `--max_cpus 1` Use to set memory for a single CPU. Should be a string in the format integer-unit eg. `--single_cpu_mem '8.GB'` -## AWSBatch specific parameters +## Containers -Running the pipeline on AWSBatch requires a couple of specific parameters to be set according to your AWSBatch configuration. -Please use [`-profile awsbatch`](https://github.com/nf-core/configs/blob/master/conf/awsbatch.config) and then specify all of the following parameters. +`sarek`, our main container is designed using [Conda](https://conda.io/). -### --awsqueue +[![sarek-docker status](https://img.shields.io/docker/automated/nfcore/sarek.svg)](https://hub.docker.com/r/nfcore/sarek) -The JobQueue that you intend to use on AWSBatch. 
+Based on [nfcore/base:1.10.2](https://hub.docker.com/r/nfcore/base/tags), it contains: -### --awsregion +- **[ASCAT](https://github.com/Crick-CancerGenomics/ascat)** 2.5.2 +- **[AlleleCount](https://github.com/cancerit/alleleCount)** 4.0.2 +- **[BCFTools](https://github.com/samtools/bcftools)** 1.9 +- **[bwa](https://github.com/lh3/bwa)** 0.7.17 +- **[bwa-mem2](https://github.com/bwa-mem2/bwa-mem2)** 2.0 +- **[CNVkit](https://github.com/etal/cnvkit)** 0.9.6 +- **[Control-FREEC](https://github.com/BoevaLab/FREEC)** 11.5 +- **[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/)** 0.11.9 +- **[fgbio](https://github.com/fulcrumgenomics/fgbio)** 1.1.0 +- **[FreeBayes](https://github.com/ekg/freebayes)** 1.3.2 +- **[GATK4-spark](https://github.com/broadinstitute/gatk)** 4.1.6.0 +- **[GeneSplicer](https://ccb.jhu.edu/software/genesplicer/)** 1.0 +- **[ggplot2](https://github.com/tidyverse/ggplot2)** 3.3.0 +- **[HTSlib](https://github.com/samtools/htslib)** 1.9 +- **[Manta](https://github.com/Illumina/manta)** 1.6.0 +- **[msisensor](https://github.com/ding-lab/msisensor)** 0.5 +- **[MultiQC](https://github.com/ewels/MultiQC/)** 1.8 +- **[Qualimap](http://qualimap.bioinfo.cipf.es)** 2.2.2d +- **[SAMBLASTER](https://github.com/GregoryFaust/samblaster)** 0.1.24 +- **[samtools](https://github.com/samtools/samtools)** 1.9 +- **[snpEff](http://snpeff.sourceforge.net/)** 4.3.1t +- **[Strelka2](https://github.com/Illumina/strelka)** 2.9.10 +- **[TIDDIT](https://github.com/SciLifeLab/TIDDIT)** 2.7.1 +- **[pigz](https://zlib.net/pigz/)** 2.3.4 +- **[Trim Galore](https://github.com/FelixKrueger/TrimGalore)** 0.6.5 +- **[VCFanno](https://github.com/brentp/vcfanno)** 0.3.2 +- **[VCFtools](https://vcftools.github.io/index.html)** 0.1.16 +- **[VEP](https://github.com/Ensembl/ensembl-vep)** 99.2 -The AWS region to run your job in. -Default is set to `eu-west-1` but can be adjusted to your needs. +For annotation, the main container can be used, but the cache has to be downloaded, or additional containers are available with cache (see [annotation documentation](#using-downloaded-cache)): -### --awscli +`sareksnpeff`, our `snpeff` container is designed using [Conda](https://conda.io/). -The [AWS CLI](https://www.nextflow.io/docs/latest/awscloud.html#aws-cli-installation) path in your custom AMI. -Default: `/home/ec2-user/miniconda/bin/aws`. +[![sareksnpeff-docker status](https://img.shields.io/docker/automated/nfcore/sareksnpeff.svg)](https://hub.docker.com/r/nfcore/sareksnpeff) -Please make sure to also set the `-w/--work-dir` and `--outdir` parameters to a S3 storage bucket of your choice - you'll get an error message notifying you if you didn't. +Based on [nfcore/base:1.10.2](https://hub.docker.com/r/nfcore/base/tags), it contains: + +- **[snpEff](http://snpeff.sourceforge.net/)** 4.3.1t +- Cache for `GRCh37`, `GRCh38`, `GRCm38`, `CanFam3.1` or `WBcel235` + +`sarekvep`, our `vep` container is designed using [Conda](https://conda.io/). + +[![sarekvep-docker status](https://img.shields.io/docker/automated/nfcore/sarekvep.svg)](https://hub.docker.com/r/nfcore/sarekvep) + +Based on [nfcore/base:1.10.2](https://hub.docker.com/r/nfcore/base/tags), it contains: + +- **[GeneSplicer](https://ccb.jhu.edu/software/genesplicer/)** 1.0 +- **[VEP](https://github.com/Ensembl/ensembl-vep)** 99.2 +- Cache for `GRCh37`, `GRCh38`, `GRCm38`, `CanFam3.1` or `WBcel235` + +### Building your owns -## Deprecated params +Our containers are designed using [Conda](https://conda.io/). 
+The [`environment.yml`](../environment.yml) file can be modified if particular versions of tools are more suited to your needs. -> **WARNING** These params are deprecated -- They will be removed in a future release. +The following commands can be used to build/download containers on your own system: -### --annotateVCF +- Adjust `VERSION` for sarek version (typically a release or `dev`). -> Please check: [`--input`](#--input) +#### Build with Conda -### --noGVCF +```Bash +conda env create -f environment.yml +``` + +#### Build with Docker + +- `sarek` + +```Bash +docker build -t nfcore/sarek: . +``` + +- `sareksnpeff` + +Adjust arguments for `GENOME` version and snpEff `CACHE_VERSION` + +```Bash +docker build -t nfcore/sareksnpeff:. containers/snpeff/. --build-arg GENOME= --build-arg CACHE_VERSION= +``` + +- `sarekvep` + +Adjust arguments for `GENOME` version, `SPECIES` name and VEP `VEP_VERSION` + +```Bash +docker build -t nfcore/sarekvep:. containers/vep/. --build-arg GENOME= --build-arg SPECIES= --build-arg VEP_VERSION= +``` + +#### Pull with Docker + +- `sarek` + +```Bash +docker pull nfcore/sarek: +``` + +- `sareksnpeff` + +Adjust arguments for `GENOME` version + +```Bash +docker pull nfcore/sareksnpeff:. +``` + +- `sarekvep` + +Adjust arguments for `GENOME` version -> Please check: [`--no_gvcf`](#--no_gvcf) +```Bash +docker pull nfcore/sarekvep:. +``` + +#### Pull with Singularity -### --noReports +You can directly pull singularity image, in the path used by the Nextflow ENV variable `NXF_SINGULARITY_CACHEDIR`, ie: -> Please check: [`--skipQC`](#--skipQC) +```Bash +cd $NXF_SINGULARITY_CACHEDIR +singularity build ... +``` -### --noStrelkaBP +- `sarek` -> Please check: [`--no_strelka_bp`](#--no_strelka_bp) +```Bash +singularity build nfcore-sarek-.img docker://nfcore/sarek: +``` -### --nucleotidesPerSecond +- `sareksnpeff` -> Please check: [`--nucleotides_per_second`](#--nucleotides_per_second) +Adjust arguments for `GENOME` version -### --publishDirMode +```Bash +singularity build nfcore-sareksnpeff-..img docker://nfcore/sareksnpeff:. +``` -> Please check: [`--publish_dir_mode`](#--publish_dir_mode) +- `sarekvep` -### --sample +Adjust arguments for `GENOME` version -> Please check: [`--input`](#--input) +```Bash +singularity build nfcore-sarekvep-..img docker://nfcore/sarekvep:. +``` -### --sampleDir +## AWSBatch specific parameters -> Please check: [`--input`](#--input) +Running the pipeline on AWSBatch requires a couple of specific parameters to be set according to your AWSBatch configuration. +Please use [`-profile awsbatch`](https://github.com/nf-core/configs/blob/master/conf/awsbatch.config) and then specify all of the following parameters. -### --skipQC +### --awsqueue -> Please check: [`--skip_qc`](#--skip_qc) +The JobQueue that you intend to use on AWSBatch. -### --targetBED +### --awsregion -> Please check: [`--target_bed`](#--target_bed) +The AWS region to run your job in. + +Default is set to `eu-west-1` but can be adjusted to your needs. + +### --awscli + +The [AWS CLI](https://www.nextflow.io/docs/latest/awscloud.html#aws-cli-installation) path in your custom AMI. + +Default: `/home/ec2-user/miniconda/bin/aws`. + +Please make sure to also set the `-w/--work-dir` and `--outdir` parameters to a S3 storage bucket of your choice - you'll get an error message notifying you if you didn't. 
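+
+Putting these together, a hypothetical AWSBatch invocation could look like this (queue, region and bucket names freely chosen for illustration):
+
+```bash
+nextflow run nf-core/sarek -profile awsbatch --awsqueue my-queue --awsregion eu-west-1 \
+    -w s3://my-bucket/work --outdir s3://my-bucket/results --input my_samples.tsv
+```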
diff --git a/docs/use_cases.md b/docs/use_cases.md deleted file mode 100644 index f49838c567..0000000000 --- a/docs/use_cases.md +++ /dev/null @@ -1,125 +0,0 @@ -# Use cases - -The workflow has two pre-processing options: `mapping` and `recalibrate`. -Using the `mapping` directive one will have a pair of mapped, deduplicated and recalibrated BAM files in the `Preprocessing/Recalibrated/` directory. -This is the usual option you have to give when you are starting from raw FASTQ data: - -```bash -nextflow run nf-core/sarek --input --tools -``` - -`mapping` will start by default, you do not have to give any additional parameters, only the TSV file describing the sample (see below). - -During the execution of the workflow a `execution_trace.txt`, a `execution_timeline.html` and a `execution_report.html` files are generated automatically. -These files contain statistics about resources used and processes finished. -If you start a new workflow or restart/resume a sample, the previous version will be renamed as `execution_trace.txt.1`, `execution_timeline.html.1` and `execution_report.html.1` respectively. -Also, older version are renamed with incremented numbers. - -## Starting from raw FASTQ - pair of FASTQ files - -The workflow should be started in this case with the smallest set of options as written above: - -```bash -nextflow run nf-core/sarek --input --tools -``` - -The TSV file should look like: - -| | | | | | | | -|-|-|-|-|-|-|-| -|SUBJECT_ID|XX|0|SAMPLE_ID|1|/samples/normal_1.fastq.gz|/samples/normal_2.fastq.gz| - -See the [input files documentation](input.md) for more information. - -## Starting from raw FASTQ - a directory with normal sample only - -The `--input` option can be also used to point Sarek to a directory with FASTQ files: - -```bash -nextflow run nf-core/sarek --input --tools -``` - -The given directory is searched recursively for FASTQ files that are named `*_R1_*.fastq.gz`, and a matching pair with the same name except `_R2_` instead of `_R1_` is expected to exist alongside. -All of the found FASTQ files are considered to belong to the sample. -Each FASTQ file pair gets its own read group (`@RG`) in the resulting BAM file. - -### Metadata when using `--input` with a directory - -When using `--input` with a directory, the metadata about the sample that are written to the BAM header in the `@RG` tag are determined in the following way. - -- The sample name (`SM`) is derived from the the last component of the path given to `--input`. -That is, you should make sure that that directory has a meaningful name! For example, with `--input=/my/fastqs/sample123`, the sample name will be `sample123`. -- The read group id is set to *flowcell.samplename.lane*. -The flowcell id and lane number are auto-detected from the name of the first read in the FASTQ file. - -## Starting from raw FASTQ - pair of FASTQ files for tumor/normal samples - -The workflow command line is just the same as before, but the TSV contains extra lines. -You can see the second column is used to distinguish normal and tumor samples. -You can add as many relapse samples as many you have, providing their name in the third column is different. -Each will be compared to the normal one-by-one. -Usually there are more read groups - sequencing lanes - for a single sequencing run, and in a flowcell different lanes have to be recalibrated separately. -This is captured in the TSV file only in the following manner, adding read group numbers or IDs in the fourth column. 
-All lanes belonging to the same Sample will be merged together after the FASTQ pairs are mapped to the reference genome. -Obviously, if you do not have relapse samples, you can leave out the two last lines. - -| | | | | | | | -|-|-|-|-|-|-|-| -|SUBJECT_ID|XX|0|SAMPLE_ID_N|1|/path/to/normal1_1.fastq.gz|/path/to/normal1_2.fastq.gz| -|SUBJECT_ID|XX|0|SAMPLE_ID_N|2|/path/to/normal2_1.fastq.gz|/path/to/normal2_2.fastq.gz| -|SUBJECT_ID|XX|1|SAMPLE_ID_T|3|/path/to/tumor3_1.fastq.gz|/path/to/tumor3_2.fastq.gz| -|SUBJECT_ID|XX|1|SAMPLE_ID_T|4|/path/to/tumor4_1.fastq.gz|/path/to/tumor4_2.fastq.gz| -|SUBJECT_ID|XX|1|SAMPLE_ID_T|5|/path/to/tumor5_1.fastq.gz|/path/to/tumor5_2.fastq.gz| -|SUBJECT_ID|XX|1|SAMPLE_ID_R|7|/path/to/relapse7_1.fastq.gz|/path/to/relapse7_2.fastq.gz| -|SUBJECT_ID|XX|1|SAMPLE_ID_R|9|/path/to/relapse9_1.fastq.gz|/path/to/relapse9_2.fastq.gz| - -See the [input files documentation](input.md) for more information. - -## Starting from recalibration - -```bash -nextflow run nf-core/sarek --input --step recalibrate --tools -``` - -And the corresponding TSV file should be like: -Obviously, if you do not have tumor or relapse samples, you can leave out the two last lines. - -| | | | | | | | -|-|-|-|-|-|-|-| -|SUBJECT_ID|XX|0|SAMPLE_ID_N|/path/to/SAMPLE_ID_N.bam|/path/to/SAMPLE_ID_N.bai|/path/to/SAMPLE_ID_N.recal.table| -|SUBJECT_ID|XX|1|SAMPLE_ID_T|/path/to/SAMPLE_ID_T.bam|/path/to/SAMPLE_ID_T.bai|/path/to/SAMPLE_ID_T.recal.table| -|SUBJECT_ID|XX|1|SAMPLE_ID_R|/path/to/SAMPLE_ID_R.bam|/path/to/SAMPLE_ID_R.bai|/path/to/SAMPLE_ID_R.recal.table| - -See the [input files documentation](input.md) for more information. - -## Starting from a recalibrated BAM file - -At this step we are assuming that all the required preprocessing is over, we only want to run variant callers or other tools using recalibrated BAM files. - -```bash -nextflow run nf-core/sarek --step variantcalling --tools -``` - -And the corresponding TSV file should be like: - -| | | | | | | -|-|-|-|-|-|-| -|SUBJECT_ID|XX|0|SAMPLE_ID_N|/path/to/SAMPLE_ID_N.bam|/path/to/SAMPLE_ID_N.bai| -|SUBJECT_ID|XX|1|SAMPLE_ID_T|/path/to/SAMPLE_ID_T.bam|/path/to/SAMPLE_ID_T.bai| -|SUBJECT_ID|XX|1|SAMPLE_ID_R|/path/to/SAMPLE_ID_R.bam|/path/to/SAMPLE_ID_R.bai| - -See the [input files documentation](input.md) for more information. - -If you want to restart a previous run of the pipeline, you may not have a recalibrated BAM file. -In this case, you need to start with `--step=recalibrate` (see previous section). - -## Using Sarek with targeted (whole exome or panel) sequencing data - -The recommended flow for targeted sequencing data is to use the workflow as it is, but also provide a BED file containing targets for all steps using the `--targetBED` option. -The workflow will pick up these intervals, and activate any `--exome` flag in any tools that allow it to process deeper coverage. -It is advised to pad the variant calling regions (exons or target) to some extent before submitting to the workflow. 
-To add the target BED file configure the command line like: - -```bash -nextflow run nf-core/sarek --tools haplotypecaller,strelka,mutect2 --target_bed --input -``` diff --git a/docs/variant_calling.md b/docs/variant_calling.md deleted file mode 100644 index 5b4d4a72b6..0000000000 --- a/docs/variant_calling.md +++ /dev/null @@ -1,57 +0,0 @@ -# Variant calling - -- [Germline variant calling](#germline-variant-calling) -- [Somatic variant calling with tumor - normal pairs](#somatic-variant-calling-with-tumor---normal-pairs) -- [Somatic variant calling with tumor only samples](#somatic-variant-calling-with-tumor-only-samples) - -## Germline variant calling - -Using Sarek, germline variant calling will always be performed if a variant calling tool with a germline mode is selected. -You can specify the variant caller to use with the `--tools` parameter (see [usage](./usage.md) for more information). - -Germline variant calling can currently only be performed with the following variant callers: - -- FreeBayes -- HaplotypeCaller -- Manta -- mpileup -- Sentieon (check the specific [sentieon](sentieon.md) documentation) -- Strelka -- TIDDIT - -For more information on the individual variant callers, and where to find the variant calling results, check the [output](output.md) documentation. - -## Somatic variant calling with tumor - normal pairs - -Using Sarek, somatic variant calling will be performed, if your input tsv file contains tumor / normal pairs (see [input](input.md) documentation for more information). -Different samples belonging to the same patient, where at least one is marked as normal (`0` in the `Status` column) and at least one is marked as tumor (`1` in the `Status` column) are treated as tumor / normal pairs. - -If tumor-normal pairs are provided, both germline variant calling and somatic variant calling will be performed, provided that the selected variant caller allows for it. -If the selected variant caller allows only for somatic variant calling, then only somatic variant calling results will be generated. - -Here is a list of the variant calling tools that support somatic variant calling: - -- ASCAT (check the specific [ASCAT](ascat.md) documentation) -- ControlFREEC -- FreeBayes -- Manta -- MSIsensor -- Mutect2 -- Sentieon (check the specific [sentieon](sentieon.md) documentation) -- Strelka - -For more information on the individual variant callers, and where to find the variant calling results, check the [output](output.md) documentation. - -## Somatic variant calling with tumor only samples - -Somatic variant calling with only tumor samples (no matching normal sample), is not recommended by the GATK best practices. -This is just supported for a limited variant callers. - -Here is a list of the variant calling tools that support tumor-only somatic variant calling: - -- Manta -- mpileup -- Mutect2 -- TIDDIT - -For more information on the individual variant callers, and where to find the variant calling results, check the [output](output.md) documentation. 
From 3fca619b3f915401d23700abe52170a48b085d9d Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 21 Sep 2020 16:04:42 +0200 Subject: [PATCH 116/200] update Nextflow required version --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9d486710a5..3646930512 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ > **An open-source analysis pipeline to detect germline or somatic variants from whole genome or targeted sequencing** -[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A519.10.0-brightgreen.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A520.07.1-brightgreen.svg)](https://www.nextflow.io/) [![nf-core](https://img.shields.io/badge/nf--core-pipeline-brightgreen.svg)](https://nf-co.re/) [![DOI](https://zenodo.org/badge/184289291.svg)](https://zenodo.org/badge/latestdoi/184289291) From 3b1d3a81105e6c4914c959ea5e55a6adec2cb106 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 21 Sep 2020 16:07:16 +0200 Subject: [PATCH 117/200] add nextflow_schema.json --- nextflow.config | 35 +- nextflow_schema.json | 736 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 753 insertions(+), 18 deletions(-) create mode 100644 nextflow_schema.json diff --git a/nextflow.config b/nextflow.config index ce60594cf7..4d96e55f67 100644 --- a/nextflow.config +++ b/nextflow.config @@ -5,16 +5,6 @@ * Default config options for all environments. */ -manifest { - name = 'nf-core/sarek' - author = 'Maxime Garcia, Szilveszter Juhos' - homePage = 'https://github.com/nf-core/sarek' - description = 'An open-source analysis pipeline to detect germline or somatic variants from whole genome or targeted sequencing' - mainScript = 'main.nf' - nextflowVersion = '>=20.07.0' - version = '3.0dev' -} - // Global default params, used in configs params { // Workflow flags: @@ -47,10 +37,10 @@ params { three_prime_clip_r2 = 0 trim_nextseq = 0 save_trimmed = false - single_end = false // No single end split_fastq = null // Fastq files will not be split by default // Preprocessing + aligner = 'bwa-mem' markdup_java_options = '"-Xms4000m -Xmx7g"' // Established values for markDuplicates memory consumption, see https://github.com/SciLifeLab/Sarek/pull/689 for details no_gatk_spark = null // GATK Spark implementation of their tools in local mode used by default save_bam_mapped = null // Mapped BAMs not saved @@ -107,7 +97,7 @@ params { // Base specifications // Defaults only, expecting to be overwritten - cpus = 8 + cpus = 8 max_cpus = 16 max_memory = 128.GB max_time = 240.h @@ -122,9 +112,6 @@ process.container = 'nfcore/sarek:dev' // Load base.config by default for all pipelines includeConfig 'conf/base.config' -// Load modules.config by default for all pipelines -includeConfig 'conf/modules.config' - // Load nf-core custom profiles from different Institutions try { includeConfig "${params.custom_config_base}/nfcore_custom.config" @@ -165,8 +152,8 @@ profiles { test_targeted { includeConfig 'conf/test_targeted.config' } test_tool { includeConfig 'conf/test_tool.config' } test_trimming { includeConfig 'conf/test_trimming.config' } - test_haplotypecaller { includeConfig 'conf/test_germline_variantcalling.config' } - + test_umi_tso { includeConfig 'conf/test_umi_tso.config' } + test_umi_qiaseq { includeConfig 'conf/test_umi_qiaseq.config' } } // Load genomes.config or igenomes.config @@ -176,9 +163,11 @@ if (!params.igenomes_ignore) { includeConfig 'conf/genomes.config' } -// Export this variable to prevent local Python 
libraries from conflicting with those in the container +// Export these variables to prevent local Python/R libraries from conflicting with those in the container env { PYTHONNOUSERSITE = 1 + R_PROFILE_USER = "/.Rprofile" + R_ENVIRON_USER = "/.Renviron" } // Capture exit codes from upstream processes when piping @@ -201,6 +190,16 @@ trace { file = "${params.tracedir}/execution_trace.txt" } +manifest { + name = 'nf-core/sarek' + author = 'Maxime Garcia, Szilveszter Juhos' + homePage = 'https://github.com/nf-core/sarek' + description = 'An open-source analysis pipeline to detect germline or somatic variants from whole genome or targeted sequencing' + mainScript = 'main.nf' + nextflowVersion = '>=20.07.1' + version = '3.0dev' +} + // Return the minimum between requirements and a maximum limit to ensure that resource requirements don't go over def check_resource(obj) { try { diff --git a/nextflow_schema.json b/nextflow_schema.json new file mode 100644 index 0000000000..577e125a78 --- /dev/null +++ b/nextflow_schema.json @@ -0,0 +1,736 @@ +{ + "$schema": "https://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/nf-core/sarek/master/nextflow_schema.json", + "title": "nf-core/sarek pipeline parameters", + "description": "An open-source analysis pipeline to detect germline or somatic variants from whole genome or targeted sequencing", + "type": "object", + "definitions": { + "input_output_options": { + "title": "Input/output options", + "type": "object", + "fa_icon": "fas fa-terminal", + "description": "Define where the pipeline should find input data and save output data.", + "required": [ + "input", + "step" + ], + "properties": { + "input": { + "type": "string", + "fa_icon": "fas fa-dna", + "description": "Path to input file.", + "help_text": "Use this to specify the location of your input TSV file. Input TSV file on `mapping`, `prepare_recalibration`, `recalibrate`, `variant_calling` and `Control-FREEC` steps\nMultiple TSV files can be specified with quotes\nWorks also with the path to a directory on `mapping` step with a single germline sample only\nAlternatively, path to VCF input file on `annotate` step\nMultiple VCF files can be specified with quotes." + }, + "step": { + "type": "string", + "default": "mapping", + "fa_icon": "fas fa-play", + "description": "Starting step", + "help_text": "(only one)", + "enum": [ + "mapping", + "prepare_recalibration", + "recalibrate", + "variant_calling", + "annotate", + "Control-FREEC" + ] + }, + "outdir": { + "type": "string", + "description": "The output directory where the results will be saved.", + "default": "./results", + "fa_icon": "fas fa-folder-open" + }, + "email": { + "type": "string", + "description": "Email address for completion summary.", + "fa_icon": "fas fa-envelope", + "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. 
If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.",
+ "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$"
+ }
+ }
+ },
+ "main_options": {
+ "title": "Main options",
+ "type": "object",
+ "description": "Options used for most of the pipeline",
+ "default": "",
+ "properties": {
+ "tools": {
+ "type": "string",
+ "default": "null",
+ "fa_icon": "fas fa-toolbox",
+ "description": "Specify tools to use for variant calling and/or for annotation",
+ "help_text": "Multiple separated with commas\n\n`DNAseq`, `DNAscope` and `TNscope` are only available with `--sentieon`",
+ "enum": [
+ "null",
+ "ASCAT",
+ "CNVkit",
+ "ControlFREEC",
+ "FreeBayes",
+ "HaplotypeCaller",
+ "Manta",
+ "mpileup",
+ "MSIsensor",
+ "Mutect2",
+ "Strelka",
+ "TIDDIT",
+ "snpEff",
+ "VEP",
+ "merge",
+ "DNAseq",
+ "DNAscope",
+ "TNscope"
+ ]
+ },
+ "no_intervals": {
+ "type": "string",
+ "default": "null",
+ "fa_icon": "fas fa-ban",
+ "description": "Disable usage of intervals",
+ "help_text": "Intervals are part of the genome chopped up, used to speed up preprocessing and variant calling"
+ },
+ "nucleotides_per_second": {
+ "type": "string",
+ "fa_icon": "fas fa-clock",
+ "description": "Estimate interval size",
+ "help_text": "Intervals are part of the genome chopped up, used to speed up preprocessing and variant calling"
+ },
+ "sentieon": {
+ "type": "string",
+ "default": "null",
+ "fa_icon": "fas fa-tools",
+ "description": "Enable Sentieon if available",
+ "help_text": "Adds the following options for --tools: DNAseq, DNAscope and TNscope"
+ },
+ "skip_qc": {
+ "type": "string",
+ "fa_icon": "fas fa-forward",
+ "description": "Specify which QC tools to skip",
+ "help_text": "Multiple separated with commas\n\n`--skip_qc BaseRecalibrator` does not skip the process itself, it only skips saving the reports",
+ "enum": [
+ "null",
+ "all",
+ "bamQC",
+ "BaseRecalibrator",
+ "BCFtools",
+ "Documentation",
+ "FastQC",
+ "MultiQC",
+ "samtools",
+ "vcftools",
+ "versions"
+ ]
+ },
+ "target_bed": {
+ "type": "string",
+ "fa_icon": "fas fa-crosshairs",
+ "description": "Target BED file for whole exome or targeted sequencing"
+ }
+ },
+ "fa_icon": "fas fa-user-cog"
+ },
+ "trim_split_fastq": {
+ "title": "Trim/split FASTQ",
+ "type": "object",
+ "description": "",
+ "default": "",
+ "fa_icon": "fas fa-cut",
+ "properties": {
+ "trim_fastq": {
+ "type": "string",
+ "fa_icon": "fas fa-cut",
+ "description": "Run Trim Galore"
+ },
+ "clip_r1": {
+ "type": "integer",
+ "fa_icon": "fas fa-cut",
+ "description": "Remove bp from the 5' end of read 1",
+ "help_text": "With Trim Galore"
+ },
+ "clip_r2": {
+ "type": "integer",
+ "description": "Remove bp from the 5' end of read 2",
+ "help_text": "With Trim Galore"
+ },
+ "three_prime_clip_r1": {
+ "type": "integer",
+ "fa_icon": "fas fa-cut",
+ "description": "Remove bp from the 3' end of read 1 AFTER adapter/quality trimming has been performed",
+ "help_text": "With Trim Galore"
+ },
+ "three_prime_clip_r2": {
+ "type": "integer",
+ "fa_icon": "fas fa-cut",
+ "description": "Remove bp from the 3' end of read 2 AFTER adapter/quality trimming has been performed",
+ "help_text": "With Trim Galore"
+ },
+ "trim_nextseq": {
+ "type": "integer",
+ "fa_icon": "fas fa-cut",
+ "description": "Apply the --nextseq=X option, to trim based on quality after removing poly-G tails",
+ "help_text": "With Trim Galore"
+ },
+ "save_trimmed": {
+ "type": "string",
+ "fa_icon": "fas fa-save",
+ 
"description": "Specify how many reads should be contained in the split FastQ file", + "help_text": "If none specified, FastQs won't be split" + }, + "split_fastq": { + "type": "string", + "default": "null", + "fa_icon": "fas fa-cut", + "description": "Save trimmed FastQ file intermediates", + "help_text": "If not used, FastQs won't be split" + } + } + }, + "preprocessing": { + "title": "Preprocessing", + "type": "object", + "description": "", + "default": "", + "fa_icon": "fas fa-toolbox", + "properties": { + "aligner": { + "type": "string", + "default": "bwa-mem", + "fa_icon": "fas fa-puzzle-piece" + }, + "markdup_java_options": { + "type": "string", + "default": "-Xms4000m -Xmx7g", + "fa_icon": "fas fa-memory", + "description": "Establish values for GATK MarkDuplicates memory consumption", + "help_text": "See https://github.com/SciLifeLab/Sarek/pull/689" + }, + "no_gatk_spark": { + "type": "string", + "default": "null", + "fa_icon": "fas fa-ban", + "description": "Disable usage of GATK Spark implementation" + }, + "save_bam_mapped": { + "type": "string", + "default": "null", + "fa_icon": "fas fa-download", + "description": "Save Mapped BAMs" + }, + "skip_markduplicates": { + "type": "string", + "default": "null", + "fa_icon": "fas fa-forward", + "description": "Skip GATK MarkDuplicates" + } + }, + "required": [ + "aligner" + ] + }, + "variant_calling": { + "title": "Variant Calling", + "type": "object", + "description": "", + "default": "", + "fa_icon": "fas fa-toolbox", + "properties": { + "ascat_ploidy": { + "type": "string", + "default": "null", + "fa_icon": "fas fa-wrench", + "description": "Overwrite ASCAT ploidy", + "help_text": "Requires that --ascat_purity is set" + }, + "ascat_purity": { + "type": "string", + "default": "null", + "fa_icon": "fas fa-wrench", + "description": "Overwrite ASCAT purity", + "help_text": "Requires that --ascat_ploidy is set" + }, + "cf_coeff": { + "type": "number", + "default": 0.015, + "fa_icon": "fas fa-wrench", + "description": "Overwrite Control-FREEC coefficientOfVariation" + }, + "cf_ploidy": { + "type": "integer", + "default": 2, + "fa_icon": "fas fa-wrench", + "description": "Overwrite Control-FREEC ploidy" + }, + "cf_window": { + "type": "string", + "fa_icon": "fas fa-wrench", + "description": "Overwrite Control-FREEC window size" + }, + "no_gvcf": { + "type": "string", + "default": "null", + "fa_icon": "fas fa-ban", + "description": "No g.vcf output from GATK HaplotypeCaller" + }, + "no_strelka_bp": { + "type": "string", + "default": "null", + "fa_icon": "fas fa-ban", + "description": "Will not use Manta candidateSmallIndels for Strelka", + "help_text": "Not recommended by Best Practices" + }, + "pon": { + "type": "string", + "fa_icon": "fas fa-file", + "description": "Panel-of-normals VCF (bgzipped) for GATK Mutect2 / Sentieon TNscope", + "help_text": "See https://gatk.broadinstitute.org/hc/en-us/articles/360042479112-CreateSomaticPanelOfNormals-BETA" + }, + "pon_index": { + "type": "string", + "fa_icon": "fas fa-file", + "description": "Index of PON panel-of-normals VCF", + "help_text": "If none provided, will be generated automatically from the PON" + }, + "ignore_soft_clipped_bases": { + "type": "string", + "default": "null", + "fa_icon": "fas fa-ban", + "description": "Do not analyze soft clipped bases in the reads for GATK Mutect2" + }, + "umi": { + "type": "boolean", + "default": "false", + "fa_icon": "fas fa-tape", + "description": "If provided, UMIs steps will be run to extract and annotate the reads with UMI and create consensus 
reads" + }, + "read_structure1": { + "type": "string", + "default": "null", + "fa_icon": "fas fa-clipboard-list", + "description": "When processing UMIs, a read structure should always be provided for each of the fastq files.", + "help_text": "If the read does not contain any UMI, the structure will be +T (i.e. only template of any length).\nhttps://github.com/fulcrumgenomics/fgbio/wiki/Read-Structures" + }, + "read_structure2": { + "type": "string", + "default": "null", + "fa_icon": "fas fa-clipboard-list", + "description": "When processing UMIs, a read structure should always be provided for each of the fastq files.", + "help_text": "If the read does not contain any UMI, the structure will be +T (i.e. only template of any length).\nhttps://github.com/fulcrumgenomics/fgbio/wiki/Read-Structures" + } + } + }, + "annotation": { + "title": "Annotation", + "type": "object", + "description": "", + "default": "", + "fa_icon": "fas fa-toolbox", + "properties": { + "annotate_tools": { + "type": "string", + "default": "null", + "fa_icon": "fas fa-hammer", + "description": "Specify from which tools Sarek should look for VCF files to annotate", + "help_text": "Only for step `annotate`", + "enum": [ + "null", + "HaplotypeCaller", + "Manta", + "Mutect2", + "Strelka", + "TIDDIT" + ] + }, + "annotation_cache": { + "type": "boolean", + "default": "false", + "fa_icon": "fas fa-database", + "description": "Enable the use of cache for annotation", + "help_text": "To be used with `--snpeff_cache` and/or `--vep_cache`" + }, + "cadd_cache": { + "type": "string", + "default": "null", + "fa_icon": "fas fa-database", + "description": "Enable CADD cache" + }, + "cadd_indels": { + "type": "string", + "default": "null", + "fa_icon": "fas fa-file", + "description": "Path to CADD InDels file" + }, + "cadd_indels_tbi": { + "type": "string", + "default": "null", + "fa_icon": "fas fa-file", + "description": "Path to CADD InDels index" + }, + "cadd_wg_snvs": { + "type": "string", + "default": "null", + "fa_icon": "fas fa-file", + "description": "Path to CADD SNVs file" + }, + "cadd_wg_snvs_tbi": { + "type": "string", + "default": "null", + "fa_icon": "fas fa-file", + "description": "Path to CADD SNVs index" + }, + "genesplicer": { + "type": "boolean", + "default": "false", + "fa_icon": "fas fa-gavel", + "description": "Enable genesplicer within VEP" + }, + "snpeff_cache": { + "type": "string", + "default": "null", + "fa_icon": "fas fa-database", + "description": "Path to snpEff cache", + "help_text": "To be used with `--annotation_cache`" + }, + "vep_cache": { + "type": "string", + "default": "null", + "fa_icon": "fas fa-database", + "description": "Path to VEP cache", + "help_text": "To be used with `--annotation_cache`" + } + } + }, + "reference_genome_options": { + "title": "Reference genome options", + "type": "object", + "fa_icon": "fas fa-dna", + "description": "Options for the reference genome indices used to align reads.", + "properties": { + "genome": { + "type": "string", + "description": "Name of iGenomes reference.", + "fa_icon": "fas fa-book", + "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`.\n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." 
+ },
+ "ac_loci": {
+ "type": "string",
+ "fa_icon": "fas fa-file"
+ },
+ "ac_loci_gc": {
+ "type": "string",
+ "fa_icon": "fas fa-file"
+ },
+ "bwa": {
+ "type": "string",
+ "fa_icon": "fas fa-copy"
+ },
+ "chr_dir": {
+ "type": "string",
+ "fa_icon": "fas fa-folder-open"
+ },
+ "chr_length": {
+ "type": "string",
+ "fa_icon": "fas fa-file"
+ },
+ "dbsnp": {
+ "type": "string",
+ "fa_icon": "fas fa-file"
+ },
+ "dbsnp_index": {
+ "type": "string",
+ "fa_icon": "fas fa-file"
+ },
+ "dict": {
+ "type": "string",
+ "fa_icon": "fas fa-file"
+ },
+ "fasta": {
+ "type": "string",
+ "fa_icon": "fas fa-font",
+ "description": "Path to FASTA genome file.",
+ "help_text": "If you have no genome reference available, the pipeline can build one using a FASTA file. This requires additional time and resources, so it's better to use a pre-built index if possible."
+ },
+ "fasta_fai": {
+ "type": "string",
+ "fa_icon": "fas fa-file"
+ },
+ "germline_resource": {
+ "type": "string",
+ "fa_icon": "fas fa-file"
+ },
+ "germline_resource_index": {
+ "type": "string",
+ "fa_icon": "fas fa-file"
+ },
+ "intervals": {
+ "type": "string",
+ "fa_icon": "fas fa-file-alt"
+ },
+ "known_indels": {
+ "type": "string",
+ "fa_icon": "fas fa-copy"
+ },
+ "known_indels_index": {
+ "type": "string",
+ "fa_icon": "fas fa-copy"
+ },
+ "mappability": {
+ "type": "string",
+ "fa_icon": "fas fa-file"
+ },
+ "snpeff_db": {
+ "type": "string",
+ "fa_icon": "fas fa-database"
+ },
+ "species": {
+ "type": "string",
+ "fa_icon": "fas fa-microscope"
+ },
+ "vep_cache_version": {
+ "type": "string",
+ "fa_icon": "fas fa-tag"
+ },
+ "save_reference": {
+ "type": "string",
+ "default": "null",
+ "fa_icon": "fas fa-download",
+ "description": "Save built references"
+ },
+ "igenomes_base": {
+ "type": "string",
+ "description": "Directory / URL base for iGenomes references.",
+ "default": "s3://ngi-igenomes/igenomes/",
+ "fa_icon": "fas fa-cloud-download-alt",
+ "hidden": true
+ },
+ "genomes_base": {
+ "type": "string",
+ "default": "null",
+ "fa_icon": "fas fa-map-marker-alt",
+ "description": "Directory / URL base for genomes references.",
+ "help_text": "All files are supposed to be in the same folder",
+ "hidden": true
+ },
+ "igenomes_ignore": {
+ "type": "boolean",
+ "description": "Do not load the iGenomes reference config.",
+ "fa_icon": "fas fa-ban",
+ "hidden": true,
+ "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`."
+ }
+ }
+ },
+ "generic_options": {
+ "title": "Generic options",
+ "type": "object",
+ "fa_icon": "fas fa-file-import",
+ "description": "Less common options for the pipeline, typically set in a config file.",
+ "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`.",
+ "properties": {
+ "help": {
+ "type": "boolean",
+ "description": "Display help text.",
+ "hidden": true,
+ "fa_icon": "fas fa-question-circle"
+ },
+ "publish_dir_mode": {
+ "type": "string",
+ "default": "copy",
+ "hidden": true,
+ "description": "Method used to save pipeline results to output directory.",
+ "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. 
This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.",
+ "fa_icon": "fas fa-copy",
+ "enum": [
+ "symlink",
+ "rellink",
+ "link",
+ "copy",
+ "copyNoFollow",
+ "move"
+ ]
+ },
+ "name": {
+ "type": "string",
+ "description": "Workflow name.",
+ "fa_icon": "fas fa-fingerprint",
+ "hidden": true,
+ "help_text": "A custom name for the pipeline run. Unlike the core Nextflow `-name` option with one hyphen, this parameter can be reused multiple times, for example if using `-resume`. Passed through to steps such as MultiQC and used for things like report filenames and titles."
+ },
+ "email_on_fail": {
+ "type": "string",
+ "description": "Email address for completion summary, only when pipeline fails.",
+ "fa_icon": "fas fa-exclamation-triangle",
+ "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$",
+ "hidden": true,
+ "help_text": "This works exactly as with `--email`, except emails are only sent if the workflow is not successful."
+ },
+ "plaintext_email": {
+ "type": "boolean",
+ "description": "Send plain-text email instead of HTML.",
+ "fa_icon": "fas fa-remove-format",
+ "hidden": true,
+ "help_text": "Set to receive plain-text e-mails instead of HTML-formatted ones."
+ },
+ "max_multiqc_email_size": {
+ "type": "string",
+ "description": "File size limit when attaching MultiQC reports to summary emails.",
+ "default": "25.MB",
+ "fa_icon": "fas fa-file-upload",
+ "hidden": true,
+ "help_text": "If the file generated by the pipeline exceeds the threshold, it will not be attached."
+ },
+ "monochrome_logs": {
+ "type": "boolean",
+ "description": "Do not use coloured log outputs.",
+ "fa_icon": "fas fa-palette",
+ "hidden": true,
+ "help_text": "Set to disable colourful command line output and live life in monochrome."
+ },
+ "multiqc_config": {
+ "type": "string",
+ "description": "Custom config file to supply to MultiQC.",
+ "fa_icon": "fas fa-cog",
+ "hidden": true
+ },
+ "tracedir": {
+ "type": "string",
+ "description": "Directory to keep pipeline Nextflow logs and reports.",
+ "default": "${params.outdir}/pipeline_info",
+ "fa_icon": "fas fa-cogs",
+ "hidden": true
+ },
+ "sequencing_center": {
+ "type": "string",
+ "default": "null",
+ "fa_icon": "fas fa-university",
+ "description": "Name of sequencing center to be displayed in BAM file"
+ }
+ }
+ },
+ "max_job_request_options": {
+ "title": "Max job request options",
+ "type": "object",
+ "fa_icon": "fab fa-acquisitions-incorporated",
+ "description": "Set the top limit for requested resources for any single job.",
+ "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause Nextflow to stop the run with an error. These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you cannot _increase_ the resources requested by any job using these options. For that you will need your own configuration file. 
See [the nf-core website](https://nf-co.re/usage/configuration) for details.", + "properties": { + "cpus": { + "type": "integer", + "default": 8, + "fa_icon": "fas fa-microchip" + }, + "single_cpu_mem": { + "type": "string", + "default": "7 GB", + "fa_icon": "fas fa-sd-card" + }, + "max_cpus": { + "type": "integer", + "description": "Maximum number of CPUs that can be requested for any single job.", + "default": 16, + "fa_icon": "fas fa-microchip", + "hidden": true, + "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`" + }, + "max_memory": { + "type": "string", + "description": "Maximum amount of memory that can be requested for any single job.", + "default": "128.GB", + "fa_icon": "fas fa-memory", + "hidden": true, + "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`" + }, + "max_time": { + "type": "string", + "description": "Maximum amount of time that can be requested for any single job.", + "default": "240.h", + "fa_icon": "far fa-clock", + "hidden": true, + "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`" + } + } + }, + "institutional_config_options": { + "title": "Institutional config options", + "type": "object", + "fa_icon": "fas fa-university", + "description": "Parameters used to describe centralised config profiles. These should not be edited.", + "help_text": "The centralised nf-core configuration profiles use a handful of pipeline parameters to describe themselves. This information is then printed to the Nextflow log when you run a pipeline. You should not need to change these values when you run a pipeline.", + "properties": { + "custom_config_version": { + "type": "string", + "description": "Git commit id for Institutional configs.", + "default": "master", + "hidden": true, + "fa_icon": "fas fa-users-cog", + "help_text": "Provide git commit id for custom Institutional configs hosted at `nf-core/configs`. This was implemented for reproducibility purposes. Default: `master`.\n\n```bash\n## Download and use config file with following git commit id\n--custom_config_version d52db660777c4bf36546ddb188ec530c3ada1b96\n```" + }, + "custom_config_base": { + "type": "string", + "description": "Base directory for Institutional configs.", + "default": "https://raw.githubusercontent.com/nf-core/configs/master", + "hidden": true, + "help_text": "If you're running offline, nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell nextflow where to find them with the `custom_config_base` option. 
For example:\n\n```bash\n## Download and unzip the config files\ncd /path/to/my/configs\nwget https://github.com/nf-core/configs/archive/master.zip\nunzip master.zip\n\n## Run the pipeline\ncd /path/to/my/data\nnextflow run /path/to/pipeline/ --custom_config_base /path/to/my/configs/configs-master/\n```\n\n> Note that the nf-core/tools helper package has a `download` command to download all required pipeline files + singularity containers + institutional configs in one go for you, to make this process easier.", + "fa_icon": "fas fa-users-cog" + }, + "hostnames": { + "type": "string", + "description": "Institutional configs hostname.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_description": { + "type": "string", + "description": "Institutional config description.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_contact": { + "type": "string", + "description": "Institutional config contact information.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_url": { + "type": "string", + "description": "Institutional config URL link.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + } + } + } + }, + "allOf": [ + { + "$ref": "#/definitions/input_output_options" + }, + { + "$ref": "#/definitions/main_options" + }, + { + "$ref": "#/definitions/trim_split_fastq" + }, + { + "$ref": "#/definitions/preprocessing" + }, + { + "$ref": "#/definitions/variant_calling" + }, + { + "$ref": "#/definitions/annotation" + }, + { + "$ref": "#/definitions/reference_genome_options" + }, + { + "$ref": "#/definitions/generic_options" + }, + { + "$ref": "#/definitions/max_job_request_options" + }, + { + "$ref": "#/definitions/institutional_config_options" + } + ] +} \ No newline at end of file From 6c9a2fa86b302b395d4278013d0a392d29071bfa Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 21 Sep 2020 16:09:15 +0200 Subject: [PATCH 118/200] remove empty space --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 4d96e55f67..9fda24ca97 100644 --- a/nextflow.config +++ b/nextflow.config @@ -97,7 +97,7 @@ params { // Base specifications // Defaults only, expecting to be overwritten - cpus = 8 + cpus = 8 max_cpus = 16 max_memory = 128.GB max_time = 240.h From d288fe05bc4e661b9da7ae53d3879a1450fc1e0e Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 21 Sep 2020 16:10:58 +0200 Subject: [PATCH 119/200] update config --- nextflow.config | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/nextflow.config b/nextflow.config index 9fda24ca97..e19049d21e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -112,6 +112,9 @@ process.container = 'nfcore/sarek:dev' // Load base.config by default for all pipelines includeConfig 'conf/base.config' +// Load modules.config by default for all pipelines +includeConfig 'conf/modules.config' + // Load nf-core custom profiles from different Institutions try { includeConfig "${params.custom_config_base}/nfcore_custom.config" @@ -145,15 +148,16 @@ profiles { singularity.autoMounts = true singularity.enabled = true } - test { includeConfig 'conf/test.config' } - test_annotation { includeConfig 'conf/test_annotation.config' } - test_no_gatk_spark { includeConfig 'conf/test_no_gatk_spark.config' } - test_split_fastq { includeConfig 'conf/test_split_fastq.config' } - test_targeted { includeConfig 'conf/test_targeted.config' } - test_tool { includeConfig 'conf/test_tool.config' } - test_trimming { 
includeConfig 'conf/test_trimming.config' } - test_umi_tso { includeConfig 'conf/test_umi_tso.config' } - test_umi_qiaseq { includeConfig 'conf/test_umi_qiaseq.config' } + test { includeConfig 'conf/test.config' } + test_annotation { includeConfig 'conf/test_annotation.config' } + test_no_gatk_spark { includeConfig 'conf/test_no_gatk_spark.config' } + test_split_fastq { includeConfig 'conf/test_split_fastq.config' } + test_targeted { includeConfig 'conf/test_targeted.config' } + test_tool { includeConfig 'conf/test_tool.config' } + test_trimming { includeConfig 'conf/test_trimming.config' } + test_haplotypecaller { includeConfig 'conf/test_germline_variantcalling.config' } + test_umi_tso { includeConfig 'conf/test_umi_tso.config' } + test_umi_qiaseq { includeConfig 'conf/test_umi_qiaseq.config' } } // Load genomes.config or igenomes.config From e9a3eb08b9e64e5b1e55d522689b18fb8cc61e1e Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 21 Sep 2020 16:12:51 +0200 Subject: [PATCH 120/200] update CI tests --- .github/workflows/ci.yml | 75 +++++++++++++++++++++++++++++++--------- 1 file changed, 58 insertions(+), 17 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ecaf2237ae..8138cb1002 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,30 +1,50 @@ name: nf-core CI -# This workflow is triggered on pushes and PRs to the repository. -# It runs the pipeline with the minimal test dataset to check that it completes without any syntax errors. -on: [push, pull_request] +# This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors +on: + push: + branches: + - dev + pull_request: + release: + types: [published] jobs: test: + name: Run workflow tests + # Only run on push if this is the nf-core dev branch (merged PRs) + if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/sarek') }} + runs-on: ubuntu-latest env: NXF_VER: ${{ matrix.nxf_ver }} NXF_ANSI_LOG: false - runs-on: ubuntu-latest strategy: matrix: # Nextflow versions: check pipeline minimum and current latest - nxf_ver: ['20.07.0-RC1', ''] + nxf_ver: ['20.07.1', ''] steps: - - uses: actions/checkout@v2 + - name: Check out pipeline code + uses: actions/checkout@v2 + - name: Check if Dockerfile or Conda environment changed + uses: technote-space/get-diff-action@v1 + with: + PREFIX_FILTER: | + Dockerfile + environment.yml + - name: Build new docker image + if: env.GIT_DIFF + run: docker build --no-cache . 
-t nfcore/sarek:dev
+ - name: Pull docker image
+ if: ${{ !env.GIT_DIFF }}
+ run: |
+ docker pull nfcore/sarek:dev
+ docker tag nfcore/sarek:dev nfcore/sarek:dev
 - name: Install Nextflow
 run: |
 wget -qO- get.nextflow.io | bash
 sudo mv nextflow /usr/local/bin/
- - name: Pull docker image
+ - name: Run pipeline with test data
 run: |
- docker pull nfcore/sarek:dev
- docker tag nfcore/sarek:dev nfcore/sarek:dev
- - name: Run test
- run: nextflow run ${GITHUB_WORKSPACE} -profile test,docker
+ nextflow run ${GITHUB_WORKSPACE} -profile test,docker
 
 annotation:
 env:
@@ -42,7 +62,7 @@ jobs:
 sudo mv nextflow /usr/local/bin/
 env:
 # Only check Nextflow pipeline minimum version
- NXF_VER: '20.07.0-RC1'
+ NXF_VER: '20.07.1'
 - name: Pull docker image
 run: |
 docker pull nfcore/sarek:dev
@@ -65,7 +85,7 @@
 sudo mv nextflow /usr/local/bin/
 env:
 # Only check Nextflow pipeline minimum version
- NXF_VER: '20.07.0-RC1'
+ NXF_VER: '20.07.1'
 - name: Pull docker image
 run: docker pull nfcore/sarek:dev
 - name: Get test data
@@ -93,7 +113,7 @@
 sudo mv nextflow /usr/local/bin/
 env:
 # Only check Nextflow pipeline minimum version
- NXF_VER: '20.07.0-RC1'
+ NXF_VER: '20.07.1'
 - name: Pull docker image
 run: docker pull nfcore/sarek:dev
 - name: Run test for minimal genomes
@@ -105,7 +125,7 @@
 runs-on: ubuntu-latest
 strategy:
 matrix:
- profile: [test_split_fastq, test_targeted, test_trimming, test_no_gatk_spark]
+ profile: [test_split_fastq, test_targeted, test_trimming, test_no_gatk_spark, test_umi_tso, test_umi_qiaseq]
 steps:
 - uses: actions/checkout@v2
 - name: Install Nextflow
 run: |
 wget -qO- get.nextflow.io | bash
 sudo mv nextflow /usr/local/bin/
 env:
 # Only check Nextflow pipeline minimum version
- NXF_VER: '20.07.0-RC1'
+ NXF_VER: '20.07.1'
 - name: Pull docker image
 run: docker pull nfcore/sarek:dev
 - name: Run ${{ matrix.profile }} test
 run: nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.profile }},docker
 
+ aligner:
+ env:
+ NXF_ANSI_LOG: false
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ aligner: [bwa-mem, bwa-mem2]
+ steps:
+ - uses: actions/checkout@v2
+ - name: Install Nextflow
+ run: |
+ wget -qO- get.nextflow.io | bash
+ sudo mv nextflow /usr/local/bin/
+ env:
+ # Only check Nextflow pipeline minimum version
+ NXF_VER: '20.07.1'
+ - name: Pull docker image
+ run: docker pull nfcore/sarek:dev
+ - name: Run ${{ matrix.aligner }} test
+ run: nextflow run ${GITHUB_WORKSPACE} -profile test,docker --aligner ${{ matrix.aligner }}
+
 tools:
 env:
 NXF_ANSI_LOG: false
 runs-on: ubuntu-latest
 strategy:
@@ -145,7 +186,7 @@
 sudo mv nextflow /usr/local/bin/
 env:
 # Only check Nextflow pipeline minimum version
- NXF_VER: '20.07.0-RC1'
+ NXF_VER: '20.07.1'
 - name: Pull docker image
 run: docker pull nfcore/sarek:dev
 - name: Run ${{ matrix.tool }} test

From 8568f725601dae7fdf32574ef2d88a3616d86411 Mon Sep 17 00:00:00 2001
From: MaxUlysse 
Date: Mon, 21 Sep 2020 17:04:46 +0200
Subject: [PATCH 121/200] add --aligner to choose between bwa-mem and bwa-mem2

---
 conf/modules.config | 14 +++++
 main.nf | 14 ++++-
 modules/local/process/bwa_mem.nf | 44 ++++++++++++++
 modules/local/subworkflow/build_indices.nf | 6 +-
 .../nf-core/software/bwa/index/functions.nf | 59 +++++++++++++++++++
 modules/nf-core/software/bwa/index/main.nf | 31 ++++++++++
 modules/nf-core/software/bwa/index/meta.yml | 52 ++++++++++++++++
 7 files changed, 217 insertions(+), 3 deletions(-)
 create mode 100644 modules/local/process/bwa_mem.nf
 create mode 100644 modules/nf-core/software/bwa/index/functions.nf
 create mode 100644 modules/nf-core/software/bwa/index/main.nf
 create mode 100644 
modules/nf-core/software/bwa/index/meta.yml diff --git a/conf/modules.config b/conf/modules.config index 6d1df81e7d..98e5fb54a3 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -18,6 +18,20 @@ params { publish_dir = "trimgalore" publish_results = "all" } + 'bwa_index' { + args = "" + suffix = "" + publish_dir = "genome/bwa_index" + publish_results = "all" + } + 'bwa_mem' { + args = "-K 100000000 -M" + args2 = "" + extra = "" + suffix = "" + publish_dir = "" + publish_results = "all" + } 'bwamem2_index' { args = "" suffix = "" diff --git a/main.nf b/main.nf index e2c0ff5eca..1772ae9902 100644 --- a/main.nf +++ b/main.nf @@ -261,6 +261,7 @@ if (params.sentieon) log.warn "[nf-core/sarek] Sentieon will be used, only works */ include { BWAMEM2_MEM } from './modules/local/process/bwamem2_mem' +include { BWA_MEM } from './modules/local/process/bwa_mem' include { GET_SOFTWARE_VERSIONS } from './modules/local/process/get_software_versions' include { OUTPUT_DOCUMENTATION } from './modules/local/process/output_documentation' include { MERGE_BAM as MERGE_BAM_MAPPED } from './modules/local/process/merge_bam' @@ -390,9 +391,18 @@ workflow { // STEP 1: MAPPING READS TO REFERENCE GENOME WITH BWA MEM - BWAMEM2_MEM(reads_input, bwa, fasta, fai, params.modules['bwamem2_mem']) + bam_bwamem2 = Channel.empty() + bwa_mem_out = Channel.empty() - bam_bwamem2 = BWAMEM2_MEM.out + if (params.aligner == "bwa-mem") { + BWA_MEM(reads_input, bwa, fasta, fai, params.modules['bwa_mem']) + bwa_mem_out = BWA_MEM.out.bam + } else { + BWAMEM2_MEM(reads_input, bwa, fasta, fai, params.modules['bwamem2_mem']) + bam_bwamem2 = BWAMEM2_MEM.out + } + + bam_bwamem2 = bam_bwamem2.mix(bwa_mem_out) bam_bwamem2.map{ meta, bam -> patient = meta.patient diff --git a/modules/local/process/bwa_mem.nf b/modules/local/process/bwa_mem.nf new file mode 100644 index 0000000000..5a7c38ab08 --- /dev/null +++ b/modules/local/process/bwa_mem.nf @@ -0,0 +1,44 @@ +process BWA_MEM { + tag "${meta.id}" + label 'process_high' + + publishDir "${params.outdir}/bwa/${meta.sample}", + mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.endsWith('.version.txt')) null + else filename } + + container "quay.io/biocontainers/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:eabfac3657eda5818bae4090db989e3d41b01542-0" + //container "https://depot.galaxyproject.org/singularity/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:eabfac3657eda5818bae4090db989e3d41b01542-0" + + conda (params.conda ? "bioconda::bwa=0.7.17 bioconda::samtools=1.10" : null) + + input: + tuple val(meta), path(reads) + path bwa + path fasta + path fai + val options + + output: + tuple val(meta), path("*.bam"), emit: bam + path "*.version.txt" , emit: version + + script: + CN = params.sequencing_center ? "CN:${params.sequencing_center}\\t" : "" + readGroup = "@RG\\tID:${meta.run}\\t${CN}PU:${meta.run}\\tSM:${meta.sample}\\tLB:${meta.sample}\\tPL:ILLUMINA" + extra = meta.status == 1 ? 
"-B 3" : "" + """ + bwa mem \ + ${options.args} \ + -R \"${readGroup}\" \ + ${extra} \ + -t ${task.cpus} \ + ${fasta} ${reads} | \ + samtools sort --threads ${task.cpus} -m 2G - > ${meta.id}.bam + + # samtools index ${meta.id}.bam + + echo \$(bwa version 2>&1) > bwa.version.txt + """ +} diff --git a/modules/local/subworkflow/build_indices.nf b/modules/local/subworkflow/build_indices.nf index 2e5724c446..f31128dd9b 100644 --- a/modules/local/subworkflow/build_indices.nf +++ b/modules/local/subworkflow/build_indices.nf @@ -7,6 +7,7 @@ // And then initialize channels based on params or indexes that were just built include { BUILD_INTERVALS } from '../process/build_intervals.nf' +include { BWA_INDEX } from '../../nf-core/software/bwa/index/main.nf' include { BWAMEM2_INDEX } from '../../nf-core/software/bwamem2_index.nf' include { CREATE_INTERVALS_BED } from '../process/create_intervals_bed.nf' include { GATK_CREATESEQUENCEDICTIONARY as GATK_DICT } from '../../nf-core/software/gatk_createsequencedictionary.nf' @@ -29,8 +30,10 @@ workflow BUILD_INDICES{ main: result_bwa = Channel.empty() + version_bwa = Channel.empty() if (!(params.bwa) && params.fasta && 'mapping' in step) - result_bwa = BWAMEM2_INDEX(fasta) + if (params.aligner == "bwa-mem") (result_bwa, version_bwa) = BWA_INDEX(fasta, params.modules['bwa_index']) + else result_bwa = BWAMEM2_INDEX(fasta) result_dict = Channel.empty() if (!(params.dict) && params.fasta && !('annotate' in step) && !('controlfreec' in step)) @@ -86,6 +89,7 @@ workflow BUILD_INDICES{ emit: bwa = result_bwa + bwa_version = version_bwa dbsnp_tbi = result_dbsnp_tbi dict = result_dict fai = result_fai diff --git a/modules/nf-core/software/bwa/index/functions.nf b/modules/nf-core/software/bwa/index/functions.nf new file mode 100644 index 0000000000..b3ac38015b --- /dev/null +++ b/modules/nf-core/software/bwa/index/functions.nf @@ -0,0 +1,59 @@ +/* + * ----------------------------------------------------- + * Utility functions used in nf-core DSL2 module files + * ----------------------------------------------------- + */ + +/* + * Extract name of software tool from process name using $task.process + */ +def getSoftwareName(task_process) { + return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() +} + +/* + * Function to initialise default values and to generate a Groovy Map of available options for nf-core modules + */ +def initOptions(Map args) { + def Map options = [:] + options.args = args.args ?: '' + options.args2 = args.args2 ?: '' + options.publish_by_id = args.publish_by_id ?: false + options.publish_dir = args.publish_dir ?: '' + options.publish_files = args.publish_files + options.suffix = args.suffix ?: '' + return options +} + +/* + * Tidy up and join elements of a list to return a path string + */ +def getPathFromList(path_list) { + def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries + paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes + return paths.join('/') +} + +/* + * Function to save/publish module results + */ +def saveFiles(Map args) { + if (!args.filename.endsWith('.version.txt')) { + def ioptions = initOptions(args.options) + def path_list = [ ioptions.publish_dir ?: args.publish_dir ] + if (ioptions.publish_by_id) { + path_list.add(args.publish_id) + } + if (ioptions.publish_files instanceof Map) { + for (ext in ioptions.publish_files) { + if (args.filename.endsWith(ext.key)) { + def ext_list = path_list.collect() + ext_list.add(ext.value) 
+ return "${getPathFromList(ext_list)}/$args.filename" + } + } + } else if (ioptions.publish_files == null) { + return "${getPathFromList(path_list)}/$args.filename" + } + } +} diff --git a/modules/nf-core/software/bwa/index/main.nf b/modules/nf-core/software/bwa/index/main.nf new file mode 100644 index 0000000000..7dbdbd3158 --- /dev/null +++ b/modules/nf-core/software/bwa/index/main.nf @@ -0,0 +1,31 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +process BWA_INDEX { + tag "$fasta" + label 'process_high' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + container "biocontainers/bwa:v0.7.17_cv1" + //container "https://depot.galaxyproject.org/singularity/bwa:0.7.17--hed695b0_7" + + conda (params.conda ? "bioconda::bwa=0.7.17" : null) + + input: + path fasta + val options + + output: + path "${fasta}.*" , emit: index + path "*.version.txt", emit: version + + script: + def software = getSoftwareName(task.process) + def ioptions = initOptions(options) + """ + bwa index $ioptions.args $fasta + echo \$(bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//' > ${software}.version.txt + """ +} diff --git a/modules/nf-core/software/bwa/index/meta.yml b/modules/nf-core/software/bwa/index/meta.yml new file mode 100644 index 0000000000..a2f5b1ed66 --- /dev/null +++ b/modules/nf-core/software/bwa/index/meta.yml @@ -0,0 +1,52 @@ +name: bwa_index +description: Create BWA index for reference genome +keywords: + - index + - fasta + - genome + - reference +tools: + - bwa: + description: | + BWA is a software package for mapping DNA sequences against + a large reference genome, such as the human genome. + homepage: http://bio-bwa.sourceforge.net/ + documentation: http://www.htslib.org/doc/samtools.html + arxiv: arXiv:1303.3997 +params: + - outdir: + type: string + description: | + The pipeline's output directory. By default, the module will + output files into `$params.outdir/` + - publish_dir_mode: + type: string + description: | + Value for the Nextflow `publishDir` mode parameter. + Available: symlink, rellink, link, copy, copyNoFollow, move. + - conda: + type: boolean + description: | + Run the module with Conda using the software specified + via the `conda` directive +input: + - fasta: + type: file + description: Input genome fasta file + - options: + type: map + description: | + Groovy Map containing module options for passing command-line arguments and + output file paths. 
+output:
+ - index:
+ type: file
+ description: BWA genome index files
+ pattern: "*.{amb,ann,bwt,pac,sa}"
+ - version:
+ type: file
+ description: File containing software version
+ pattern: "*.{version.txt}"
+authors:
+ - "@drpatelh"
+ - "@maxulysse"

From ed3e680147948a3e58b2ada3934ccb08e3f65b92 Mon Sep 17 00:00:00 2001
From: Maxime Garcia 
Date: Tue, 22 Sep 2020 10:58:30 +0200
Subject: [PATCH 122/200] Apply suggestions from code review

Co-authored-by: FriederikeHanssen 
---
 docs/output.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/output.md b/docs/output.md
index ce5758a240..b44f7d7ce3 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -11,7 +11,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 
 - [Preprocessing](#preprocessing)
   - [Map to Reference](#map-to-reference)
-    - [bwa](#bwa)
+    - [BWA](#bwa)
     - [BWA-mem2](#bwa-mem2)
   - [Mark Duplicates](#mark-duplicates)
     - [GATK MarkDuplicates](#gatk-markduplicates)
@@ -65,9 +65,9 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 
 ### Map to Reference
 
-#### bwa
+#### BWA
 
-[bwa](https://github.com/lh3/bwa) is a software package for mapping low-divergent sequences against a large reference genome.
+[BWA](https://github.com/lh3/bwa) is a software package for mapping low-divergent sequences against a large reference genome.
 
 Such files are intermediate and not kept in the final files delivered to users.

From 7e111dfa955470f79db2441ec2f7d944e9f8e309 Mon Sep 17 00:00:00 2001
From: Maxime Garcia 
Date: Tue, 22 Sep 2020 11:02:26 +0200
Subject: [PATCH 123/200] Update docs/usage.md

---
 docs/usage.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/usage.md b/docs/usage.md
index 6a75ae6ba2..5136b5fdbf 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -926,6 +926,10 @@ Example:
 > **WARNING** Current indices for `bwa` in AWS iGenomes are not compatible with `bwa-mem2`.
 > Use `--bwa=false` to have `Sarek` build them automatically.
 
+> **WARNING** BWA-mem2 is in active development
+> Sarek might not be able to request the right amount of resources for it at the moment
+> We recommend using pre-built indexes
+
 Example:
 
 ```bash

From 7bbffd652ec584620b6468f591ab9d2052a5fde2 Mon Sep 17 00:00:00 2001
From: Maxime Garcia 
Date: Tue, 22 Sep 2020 11:16:11 +0200
Subject: [PATCH 124/200] Update docs/usage.md

---
 docs/usage.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/usage.md b/docs/usage.md
index 5136b5fdbf..0d058cd8f6 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -925,7 +925,7 @@ Example:
 
 > **WARNING** Current indices for `bwa` in AWS iGenomes are not compatible with `bwa-mem2`.
 > Use `--bwa=false` to have `Sarek` build them automatically. 
-
+>
 > **WARNING** BWA-mem2 is in active development
 > Sarek might not be able to request the right amount of resources for it at the moment
 > We recommend using pre-built indexes

From b86d5435874287a2ef299f3ee3deeff0b8776eac Mon Sep 17 00:00:00 2001
From: Maxime Garcia 
Date: Tue, 22 Sep 2020 11:18:55 +0200
Subject: [PATCH 125/200] Apply suggestions from code review

---
 docs/usage.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/usage.md b/docs/usage.md
index 0d058cd8f6..520fea039a 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -925,7 +925,7 @@ Example:
 
 > **WARNING** Current indices for `bwa` in AWS iGenomes are not compatible with `bwa-mem2`.
 > Use `--bwa=false` to have `Sarek` build them automatically.
-> 
+>
 > **WARNING** BWA-mem2 is in active development
 > Sarek might not be able to request the right amount of resources for it at the moment
 > We recommend using pre-built indexes

From b8b4da6c4d59509f077510e0c72d5cfad737ea6c Mon Sep 17 00:00:00 2001
From: MaxUlysse 
Date: Tue, 29 Sep 2020 13:35:55 +0200
Subject: [PATCH 126/200] get rid of params.conda warning

---
 nextflow.config | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/nextflow.config b/nextflow.config
index e19049d21e..7f5452dbb6 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -134,6 +134,7 @@ profiles {
 docker.enabled = false
 process.conda = "$baseDir/environment.yml"
 singularity.enabled = false
+ params.conda = true
 }
 debug { process.beforeScript = 'echo $HOSTNAME' }
 docker {
@@ -142,11 +143,13 @@
 fixOwnership = true
 }
 singularity.enabled = false
+ params.conda = false
 }
 singularity {
 docker.enabled = false
 singularity.autoMounts = true
 singularity.enabled = true
+ params.conda = false
 }
 test { includeConfig 'conf/test.config' }
 test_annotation { includeConfig 'conf/test_annotation.config' }

From ef25633821cbde010de11929694286a697f15195 Mon Sep 17 00:00:00 2001
From: MaxUlysse 
Date: Tue, 29 Sep 2020 15:45:41 +0200
Subject: [PATCH 127/200] preprocessing works with conda

---
 environment.yml | 18 ------
 main.nf | 48 +++++++++----------
 modules/local/process/build_intervals.nf | 4 ++
 modules/local/process/bwa_mem.nf | 1 +
 modules/local/process/bwamem2_mem.nf | 3 ++
 .../local/process/get_software_versions.nf | 42 ++++++++--------
 modules/local/process/merge_bam.nf | 7 ++-
 modules/local/process/output_documentation.nf | 4 ++
 modules/local/subworkflow/build_indices.nf | 4 +-
 modules/nf-core/software/bwa/index/main.nf | 3 +-
 modules/nf-core/software/bwamem2_index.nf | 4 ++
 modules/nf-core/software/fastqc.nf | 2 +
 .../{gatk_applybqsr.nf => gatk/applybqsr.nf} | 4 ++
 .../baserecalibrator.nf} | 4 ++
 .../createsequencedictionary.nf} | 4 ++
 .../gatherbqsrreports.nf} | 4 ++
 .../genotypegvcf.nf} | 4 ++
 .../haplotypecaller.nf} | 4 ++
 .../markduplicates.nf} | 4 ++
 modules/nf-core/software/htslib_tabix.nf | 4 +-
 modules/nf-core/software/multiqc.nf | 4 ++
 .../{samtools_faidx.nf => samtools/faidx.nf} | 4 ++
 .../{samtools_index.nf => samtools/index.nf} | 4 ++
 .../{samtools_stats.nf => samtools/stats.nf} | 4 ++
 modules/nf-core/software/strelka.nf | 2 -
 modules/nf-core/software/trimgalore.nf | 2 +
 nextflow.config | 5 +-
 27 files changed, 122 insertions(+), 75 deletions(-)
 rename modules/nf-core/software/{gatk_applybqsr.nf => gatk/applybqsr.nf} (87%)
 rename modules/nf-core/software/{gatk_baserecalibrator.nf => gatk/baserecalibrator.nf} (90%)
 rename modules/nf-core/software/{gatk_createsequencedictionary.nf => gatk/createsequencedictionary.nf} (78%)
 rename modules/nf-core/software/{gatk_gatherbqsrreports.nf => gatk/gatherbqsrreports.nf} (87%)
 rename modules/nf-core/software/{gatk_genotypegvcf.nf => gatk/genotypegvcf.nf} (87%)
 rename modules/nf-core/software/{gatk_haplotypecaller.nf => gatk/haplotypecaller.nf} (88%)
 rename modules/nf-core/software/{gatk_markduplicates.nf => gatk/markduplicates.nf} (93%)
 rename modules/nf-core/software/{samtools_faidx.nf => samtools/faidx.nf} (71%)
 rename modules/nf-core/software/{samtools_index.nf => samtools/index.nf} (82%)
 rename modules/nf-core/software/{samtools_stats.nf => samtools/stats.nf} (73%)

diff --git a/environment.yml b/environment.yml
index 7ff022f2ed..34b98a864d 100644
--- a/environment.yml
+++ b/environment.yml
@@ -9,28 +9,10 @@ dependencies:
 - conda-forge::markdown=3.1.1
 - 
conda-forge::pymdown-extensions=6.0 - conda-forge::pygments=2.5.2 - - bioconda::ascat=2.5.2 - - bioconda::bcftools=1.9 - bioconda::bwa-mem2=2.0 - - bioconda::cancerit-allelecount=4.0.2 - - bioconda::cnvkit=0.9.6 - - bioconda::control-freec=11.5 - bioconda::ensembl-vep=99.2 - - bioconda::fastqc=0.11.9 - - bioconda::freebayes=1.3.2 - - bioconda::gatk4-spark=4.1.7.0 - bioconda::genesplicer=1.0 - - bioconda::htslib=1.9 - - bioconda::manta=1.6.0 - - bioconda::msisensor=0.5 - - bioconda::multiqc=1.8 - - bioconda::qualimap=2.2.2d - bioconda::samtools=1.9 - bioconda::snpeff=4.3.1t - - bioconda::strelka=2.9.10 - - bioconda::tiddit=2.7.1 - - bioconda::trim-galore=0.6.5 - - bioconda::vcfanno=0.3.2 - - bioconda::vcftools=0.1.16 - conda-forge::pigz=2.3.4 - conda-forge::r-ggplot2=3.3.0 diff --git a/main.nf b/main.nf index 1772ae9902..13af1ed4ed 100644 --- a/main.nf +++ b/main.nf @@ -16,7 +16,7 @@ nf-core/sarek: https://nf-co.re/sarek -------------------------------------------------------------------------------- @Documentation - https://nf-co.re/sarek/docs + https://nf-co.re/sarek/latest/usage -------------------------------------------------------------------------------- */ @@ -260,8 +260,8 @@ if (params.sentieon) log.warn "[nf-core/sarek] Sentieon will be used, only works ================================================================================ */ +include { BWA_MEM as BWAMEM1_MEM } from './modules/local/process/bwa_mem' include { BWAMEM2_MEM } from './modules/local/process/bwamem2_mem' -include { BWA_MEM } from './modules/local/process/bwa_mem' include { GET_SOFTWARE_VERSIONS } from './modules/local/process/get_software_versions' include { OUTPUT_DOCUMENTATION } from './modules/local/process/output_documentation' include { MERGE_BAM as MERGE_BAM_MAPPED } from './modules/local/process/merge_bam' @@ -281,16 +281,16 @@ include { BUILD_INDICES } from './modules/local/subworkflow/build_indices' ================================================================================ */ -include { GATK_BASERECALIBRATOR as BASERECALIBRATOR } from './modules/nf-core/software/gatk_baserecalibrator' -include { GATK_GATHERBQSRREPORTS as GATHERBQSRREPORTS } from './modules/nf-core/software/gatk_gatherbqsrreports' -include { GATK_MARKDUPLICATES as MARKDUPLICATES } from './modules/nf-core/software/gatk_markduplicates' -include { GATK_APPLYBQSR as APPLYBQSR } from './modules/nf-core/software/gatk_applybqsr' -include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_MAPPED } from './modules/nf-core/software/samtools_index' -include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_RECAL } from './modules/nf-core/software/samtools_index' -include { SAMTOOLS_STATS as SAMTOOLS_STATS } from './modules/nf-core/software/samtools_stats' +include { GATK_BASERECALIBRATOR as BASERECALIBRATOR } from './modules/nf-core/software/gatk/baserecalibrator' +include { GATK_GATHERBQSRREPORTS as GATHERBQSRREPORTS } from './modules/nf-core/software/gatk/gatherbqsrreports' +include { GATK_MARKDUPLICATES as MARKDUPLICATES } from './modules/nf-core/software/gatk/markduplicates' +include { GATK_APPLYBQSR as APPLYBQSR } from './modules/nf-core/software/gatk/applybqsr' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_MAPPED } from './modules/nf-core/software/samtools/index' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_RECAL } from './modules/nf-core/software/samtools/index' +include { SAMTOOLS_STATS as SAMTOOLS_STATS } from './modules/nf-core/software/samtools/stats' include { QUALIMAP_BAMQC as BAMQC } from './modules/nf-core/software/qualimap_bamqc' -include { 
GATK_HAPLOTYPECALLER as HAPLOTYPECALLER } from './modules/nf-core/software/gatk_haplotypecaller'
-include { GATK_GENOTYPEVCF as GENOTYPEVCF } from './modules/nf-core/software/gatk_genotypegvcf'
+include { GATK_HAPLOTYPECALLER as HAPLOTYPECALLER } from './modules/nf-core/software/gatk/haplotypecaller'
+include { GATK_GENOTYPEVCF as GENOTYPEVCF } from './modules/nf-core/software/gatk/genotypegvcf'
 include { STRELKA as STRELKA } from './modules/nf-core/software/strelka'
 include { MULTIQC } from './modules/nf-core/software/multiqc'
@@ -391,20 +391,20 @@ workflow {
 
 // STEP 1: MAPPING READS TO REFERENCE GENOME WITH BWA MEM
 
- bam_bwamem2 = Channel.empty()
- bwa_mem_out = Channel.empty()
+ bam_bwamem1 = Channel.empty()
+ bam_bwamem2 = Channel.empty()
 
 if (params.aligner == "bwa-mem") {
- BWA_MEM(reads_input, bwa, fasta, fai, params.modules['bwa_mem'])
- bwa_mem_out = BWA_MEM.out.bam
+ BWAMEM1_MEM(reads_input, bwa, fasta, fai, params.modules['bwa_mem'])
+ bam_bwamem1 = BWAMEM1_MEM.out.bam
 } else {
 BWAMEM2_MEM(reads_input, bwa, fasta, fai, params.modules['bwamem2_mem'])
 bam_bwamem2 = BWAMEM2_MEM.out
 }
 
- bam_bwamem2 = bam_bwamem2.mix(bwa_mem_out)
+ bam_bwa = bam_bwamem1.mix(bam_bwamem2)
 
- bam_bwamem2.map{ meta, bam ->
+ bam_bwa.map{ meta, bam ->
 patient = meta.patient
 sample = meta.sample
 gender = meta.gender
@@ -414,9 +414,9 @@
 .branch{
 single: it[4].size() == 1
 multiple: it[4].size() > 1
- }.set{ bam_bwamem2_to_sort }
+ }.set{ bam_bwa_to_sort }
 
- bam_bwamem2_single = bam_bwamem2_to_sort.single.map {
+ bam_bwa_single = bam_bwa_to_sort.single.map {
 patient, sample, gender, status, bam ->
 
 def meta = [:]
@@ -429,7 +429,7 @@
 [meta, bam[0]]
 }
 
- bam_bwamem2_multiple = bam_bwamem2_to_sort.multiple.map {
+ bam_bwa_multiple = bam_bwa_to_sort.multiple.map {
 patient, sample, gender, status, bam ->
 
 def meta = [:]
@@ -444,8 +444,8 @@
 
 // STEP 1.5: MERGING AND INDEXING BAM FROM MULTIPLE LANES
 
- MERGE_BAM_MAPPED(bam_bwamem2_multiple)
- bam_mapped = bam_bwamem2_single.mix(MERGE_BAM_MAPPED.out.bam)
+ MERGE_BAM_MAPPED(bam_bwa_multiple)
+ bam_mapped = bam_bwa_single.mix(MERGE_BAM_MAPPED.out.bam)
 
 bam_mapped = SAMTOOLS_INDEX_MAPPED(bam_mapped, params.modules['samtools_index_mapped'], )
@@ -633,8 +633,8 @@
 }.collectFile(name: 'recalibrated.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV")
 
 // STEP 5: QC
- if (!'samtools' in skip_qc) SAMTOOLS_STATS(bam_bwamem2.mix(recal))
- if (!'bamqc' in skip_qc) BAMQC(bam_bwamem2.mix(recal), target_bed)
+ if (!('samtools' in skip_qc)) SAMTOOLS_STATS(bam_bwa.mix(recal))
+ if (!('bamqc' in skip_qc)) BAMQC(bam_bwa.mix(recal), target_bed)
 
 /*
 ================================================================================
diff --git a/modules/local/process/build_intervals.nf b/modules/local/process/build_intervals.nf
index ae35588344..01918147a7 100644
--- a/modules/local/process/build_intervals.nf
+++ b/modules/local/process/build_intervals.nf
@@ -4,6 +4,10 @@ process BUILD_INTERVALS {
 publishDir params.outdir,
 mode: params.publish_dir_mode,
 saveAs: {params.save_reference ? "reference_genome/${it}" : null }
 
+ container "nfcore/sarek:dsl2"
+
+ conda (params.conda ? 
"$baseDir/environment.yml" : null) + input: path fai diff --git a/modules/local/process/bwa_mem.nf b/modules/local/process/bwa_mem.nf index 5a7c38ab08..d821d5cf0d 100644 --- a/modules/local/process/bwa_mem.nf +++ b/modules/local/process/bwa_mem.nf @@ -1,5 +1,6 @@ process BWA_MEM { tag "${meta.id}" + label 'process_high' publishDir "${params.outdir}/bwa/${meta.sample}", diff --git a/modules/local/process/bwamem2_mem.nf b/modules/local/process/bwamem2_mem.nf index 920ec8daf5..aba6188e97 100644 --- a/modules/local/process/bwamem2_mem.nf +++ b/modules/local/process/bwamem2_mem.nf @@ -1,5 +1,6 @@ process BWAMEM2_MEM { tag "${meta.id}" + label 'process_high' publishDir "${params.outdir}/bwamem2/${meta.sample}", @@ -8,6 +9,8 @@ process BWAMEM2_MEM { if (filename.endsWith('.version.txt')) null else filename } + conda (params.conda ? "bioconda::bwa-mem2=2.0 bioconda::samtools=1.10" : null) + input: tuple val(meta), path(reads) path bwa diff --git a/modules/local/process/get_software_versions.nf b/modules/local/process/get_software_versions.nf index 8c52334839..124d0d9884 100644 --- a/modules/local/process/get_software_versions.nf +++ b/modules/local/process/get_software_versions.nf @@ -14,29 +14,29 @@ process GET_SOFTWARE_VERSIONS { script: """ - alleleCounter --version &> v_allelecount.txt 2>&1 || true - bcftools --version &> v_bcftools.txt 2>&1 || true - bwa-mem2 version &> v_bwamem2.txt 2>&1 || true - cnvkit.py version &> v_cnvkit.txt 2>&1 || true - configManta.py --version &> v_manta.txt 2>&1 || true - configureStrelkaGermlineWorkflow.py --version &> v_strelka.txt 2>&1 || true - echo "${workflow.manifest.version}" &> v_pipeline.txt 2>&1 || true - echo "${workflow.nextflow.version}" &> v_nextflow.txt 2>&1 || true + // alleleCounter --version &> v_allelecount.txt 2>&1 || true + // bcftools --version &> v_bcftools.txt 2>&1 || true + // bwa-mem2 version &> v_bwamem2.txt 2>&1 || true + // cnvkit.py version &> v_cnvkit.txt 2>&1 || true + // configManta.py --version &> v_manta.txt 2>&1 || true + // configureStrelkaGermlineWorkflow.py --version &> v_strelka.txt 2>&1 || true + // echo "${workflow.manifest.version}" &> v_pipeline.txt 2>&1 || true + // echo "${workflow.nextflow.version}" &> v_nextflow.txt 2>&1 || true snpEff -version &> v_snpeff.txt 2>&1 || true - fastqc --version &> v_fastqc.txt 2>&1 || true - freebayes --version &> v_freebayes.txt 2>&1 || true - freec &> v_controlfreec.txt 2>&1 || true - gatk ApplyBQSR --help &> v_gatk.txt 2>&1 || true - msisensor &> v_msisensor.txt 2>&1 || true - multiqc --version &> v_multiqc.txt 2>&1 || true - qualimap --version &> v_qualimap.txt 2>&1 || true - R --version &> v_r.txt 2>&1 || true - R -e "library(ASCAT); help(package='ASCAT')" &> v_ascat.txt 2>&1 || true + // fastqc --version &> v_fastqc.txt 2>&1 || true + // freebayes --version &> v_freebayes.txt 2>&1 || true + // freec &> v_controlfreec.txt 2>&1 || true + // gatk ApplyBQSR --help &> v_gatk.txt 2>&1 || true + // msisensor &> v_msisensor.txt 2>&1 || true + // multiqc --version &> v_multiqc.txt 2>&1 || true + // qualimap --version &> v_qualimap.txt 2>&1 || true + // R --version &> v_r.txt 2>&1 || true + // R -e "library(ASCAT); help(package='ASCAT')" &> v_ascat.txt 2>&1 || true samtools --version &> v_samtools.txt 2>&1 || true - tiddit &> v_tiddit.txt 2>&1 || true - trim_galore -v &> v_trim_galore.txt 2>&1 || true - vcftools --version &> v_vcftools.txt 2>&1 || true - vep --help &> v_vep.txt 2>&1 || true + // tiddit &> v_tiddit.txt 2>&1 || true + // trim_galore -v &> v_trim_galore.txt 2>&1 || true + // 
vcftools --version &> v_vcftools.txt 2>&1 || true
+ # vep --help &> v_vep.txt 2>&1 || true
 scrape_software_versions.py &> software_versions_mqc.yaml
 """
}
diff --git a/modules/local/process/merge_bam.nf b/modules/local/process/merge_bam.nf
index 3056e12574..5f54aae475 100644
--- a/modules/local/process/merge_bam.nf
+++ b/modules/local/process/merge_bam.nf
@@ -2,8 +2,11 @@ process MERGE_BAM {
 label 'cpus_8'
 
 tag "${meta.id}"
- //TODO publishDir
-
+
+ container "quay.io/biocontainers/samtools:1.10--h2e538c0_3"
+
+ conda (params.conda ? "bioconda::samtools=1.10" : null)
+
 input:
 tuple val(meta), path(bam)
 
diff --git a/modules/local/process/output_documentation.nf b/modules/local/process/output_documentation.nf
index bd3f9cdd4d..c9bc3f13eb 100644
--- a/modules/local/process/output_documentation.nf
+++ b/modules/local/process/output_documentation.nf
@@ -4,6 +4,10 @@ process OUTPUT_DOCUMENTATION {
 publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode
 
+ container "nfcore/sarek:dsl2"
+
+ conda (params.conda ? "$baseDir/environment.yml" : null)
+
 input:
 path output_docs
 path images
 
diff --git a/modules/local/subworkflow/build_indices.nf b/modules/local/subworkflow/build_indices.nf
index f31128dd9b..17874082e2 100644
--- a/modules/local/subworkflow/build_indices.nf
+++ b/modules/local/subworkflow/build_indices.nf
@@ -10,12 +10,12 @@ include { BUILD_INTERVALS } from '../process/build_in
 include { BWA_INDEX } from '../../nf-core/software/bwa/index/main.nf'
 include { BWAMEM2_INDEX } from '../../nf-core/software/bwamem2_index.nf'
 include { CREATE_INTERVALS_BED } from '../process/create_intervals_bed.nf'
-include { GATK_CREATESEQUENCEDICTIONARY as GATK_DICT } from '../../nf-core/software/gatk_createsequencedictionary.nf'
+include { GATK_CREATESEQUENCEDICTIONARY as GATK_DICT } from '../../nf-core/software/gatk/createsequencedictionary.nf'
 include { HTSLIB_TABIX as TABIX_DBSNP;
 HTSLIB_TABIX as TABIX_GERMLINE_RESOURCE;
 HTSLIB_TABIX as TABIX_KNOWN_INDELS;
 HTSLIB_TABIX as TABIX_PON;} from '../../nf-core/software/htslib_tabix'
-include { SAMTOOLS_FAIDX } from '../../nf-core/software/samtools_faidx.nf'
+include { SAMTOOLS_FAIDX } from '../../nf-core/software/samtools/faidx.nf'
 
 workflow BUILD_INDICES{
 take:
diff --git a/modules/nf-core/software/bwa/index/main.nf b/modules/nf-core/software/bwa/index/main.nf
index 7dbdbd3158..c951811e2a 100644
--- a/modules/nf-core/software/bwa/index/main.nf
+++ b/modules/nf-core/software/bwa/index/main.nf
@@ -8,8 +8,7 @@ process BWA_INDEX {
 mode: params.publish_dir_mode,
 saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:'') }
 
- container "biocontainers/bwa:v0.7.17_cv1"
- //container "https://depot.galaxyproject.org/singularity/bwa:0.7.17--hed695b0_7"
+ container "quay.io/biocontainers/bwa:0.7.17--hed695b0_7"
 
 conda (params.conda ? "bioconda::bwa=0.7.17" : null)
diff --git a/modules/nf-core/software/bwamem2_index.nf b/modules/nf-core/software/bwamem2_index.nf
index 661b655b7a..3b5e6c50fb 100644
--- a/modules/nf-core/software/bwamem2_index.nf
+++ b/modules/nf-core/software/bwamem2_index.nf
@@ -7,6 +7,10 @@ process BWAMEM2_INDEX {
 input:
 path fasta
 
+ container "quay.io/biocontainers/bwa-mem2:2.0--he513fc3_1"
+
+ conda (params.conda ? 
"bioconda::bwa-mem2=2.0" : null) + output: path "${fasta}.*" diff --git a/modules/nf-core/software/fastqc.nf b/modules/nf-core/software/fastqc.nf index 009710d53a..b9d1c091c3 100644 --- a/modules/nf-core/software/fastqc.nf +++ b/modules/nf-core/software/fastqc.nf @@ -12,6 +12,8 @@ process FASTQC { container "quay.io/biocontainers/fastqc:0.11.9--0" + conda (params.conda ? "bioconda::fastqc=0.11.9" : null) + input: tuple val(meta), path(reads) val options diff --git a/modules/nf-core/software/gatk_applybqsr.nf b/modules/nf-core/software/gatk/applybqsr.nf similarity index 87% rename from modules/nf-core/software/gatk_applybqsr.nf rename to modules/nf-core/software/gatk/applybqsr.nf index 6c5a47ed9d..45e9927fac 100644 --- a/modules/nf-core/software/gatk_applybqsr.nf +++ b/modules/nf-core/software/gatk/applybqsr.nf @@ -4,6 +4,10 @@ process GATK_APPLYBQSR { tag "${meta.id}-${interval.baseName}" + container "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" + + conda (params.conda ? "bioconda::gatk4-spark=4.1.8.1" : null) + input: tuple val(meta), path(bam), path(bai), path(recalibrationReport), path(interval) path dict diff --git a/modules/nf-core/software/gatk_baserecalibrator.nf b/modules/nf-core/software/gatk/baserecalibrator.nf similarity index 90% rename from modules/nf-core/software/gatk_baserecalibrator.nf rename to modules/nf-core/software/gatk/baserecalibrator.nf index 58452201e4..84fe403354 100644 --- a/modules/nf-core/software/gatk_baserecalibrator.nf +++ b/modules/nf-core/software/gatk/baserecalibrator.nf @@ -3,6 +3,10 @@ process GATK_BASERECALIBRATOR { tag "${meta.id}-${interval.baseName}" + container "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" + + conda (params.conda ? "bioconda::gatk4-spark=4.1.8.1" : null) + input: tuple val(meta), path(bam), path(bai), path(interval) path dbsnp diff --git a/modules/nf-core/software/gatk_createsequencedictionary.nf b/modules/nf-core/software/gatk/createsequencedictionary.nf similarity index 78% rename from modules/nf-core/software/gatk_createsequencedictionary.nf rename to modules/nf-core/software/gatk/createsequencedictionary.nf index 0b8baefdd9..ed902725b1 100644 --- a/modules/nf-core/software/gatk_createsequencedictionary.nf +++ b/modules/nf-core/software/gatk/createsequencedictionary.nf @@ -4,6 +4,10 @@ process GATK_CREATESEQUENCEDICTIONARY { publishDir params.outdir, mode: params.publish_dir_mode, saveAs: {params.save_reference ? "reference_genome/${it}" : null } + container "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" + + conda (params.conda ? "bioconda::gatk4-spark=4.1.8.1" : null) + input: path fasta diff --git a/modules/nf-core/software/gatk_gatherbqsrreports.nf b/modules/nf-core/software/gatk/gatherbqsrreports.nf similarity index 87% rename from modules/nf-core/software/gatk_gatherbqsrreports.nf rename to modules/nf-core/software/gatk/gatherbqsrreports.nf index c33403e5fe..7de5005abb 100644 --- a/modules/nf-core/software/gatk_gatherbqsrreports.nf +++ b/modules/nf-core/software/gatk/gatherbqsrreports.nf @@ -9,6 +9,10 @@ process GATK_GATHERBQSRREPORTS { else "Preprocessing/${meta.sample}/Mapped/${it}" } + container "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" + + conda (params.conda ? 
"bioconda::gatk4-spark=4.1.8.1" : null) + input: tuple val(meta), path(recal) diff --git a/modules/nf-core/software/gatk_genotypegvcf.nf b/modules/nf-core/software/gatk/genotypegvcf.nf similarity index 87% rename from modules/nf-core/software/gatk_genotypegvcf.nf rename to modules/nf-core/software/gatk/genotypegvcf.nf index 38a139966c..4c8f50cd16 100644 --- a/modules/nf-core/software/gatk_genotypegvcf.nf +++ b/modules/nf-core/software/gatk/genotypegvcf.nf @@ -1,6 +1,10 @@ process GATK_GENOTYPEVCF { tag "${meta.id}-${interval.baseName}" + container "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" + + conda (params.conda ? "bioconda::gatk4-spark=4.1.8.1" : null) + input: tuple val(meta), path(interval), path(gvcf) path dbsnp diff --git a/modules/nf-core/software/gatk_haplotypecaller.nf b/modules/nf-core/software/gatk/haplotypecaller.nf similarity index 88% rename from modules/nf-core/software/gatk_haplotypecaller.nf rename to modules/nf-core/software/gatk/haplotypecaller.nf index 6971224abb..44d0adc16c 100644 --- a/modules/nf-core/software/gatk_haplotypecaller.nf +++ b/modules/nf-core/software/gatk/haplotypecaller.nf @@ -4,6 +4,10 @@ process GATK_HAPLOTYPECALLER { tag "${meta.id}-${interval.baseName}" + container "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" + + conda (params.conda ? "bioconda::gatk4-spark=4.1.8.1" : null) + input: tuple val(meta), path(bam), path(bai), file(interval) path dbsnp diff --git a/modules/nf-core/software/gatk_markduplicates.nf b/modules/nf-core/software/gatk/markduplicates.nf similarity index 93% rename from modules/nf-core/software/gatk_markduplicates.nf rename to modules/nf-core/software/gatk/markduplicates.nf index 813b1a9fa6..4b79304d82 100644 --- a/modules/nf-core/software/gatk_markduplicates.nf +++ b/modules/nf-core/software/gatk/markduplicates.nf @@ -8,6 +8,10 @@ process GATK_MARKDUPLICATES { else "Preprocessing/${meta.sample}/DuplicatesMarked/${it}" } + container "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" + + conda (params.conda ? "bioconda::gatk4-spark=4.1.8.1" : null) + input: tuple val(meta), path("${meta.sample}.bam"), path("${meta.sample}.bam.bai") diff --git a/modules/nf-core/software/htslib_tabix.nf b/modules/nf-core/software/htslib_tabix.nf index fef5ab0cc9..9dca431034 100644 --- a/modules/nf-core/software/htslib_tabix.nf +++ b/modules/nf-core/software/htslib_tabix.nf @@ -1,8 +1,10 @@ process HTSLIB_TABIX { - tag {vcf} + tag "${vcf}" container 'quay.io/biocontainers/tabix:0.2.6--ha92aebf_0' + conda (params.conda ? "bioconda::tabix=0.2.6" : null) + input: path vcf diff --git a/modules/nf-core/software/multiqc.nf b/modules/nf-core/software/multiqc.nf index 30689b9564..f9ddcb46f8 100644 --- a/modules/nf-core/software/multiqc.nf +++ b/modules/nf-core/software/multiqc.nf @@ -8,6 +8,10 @@ if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) { process MULTIQC { publishDir "${params.outdir}/multiqc", mode: params.publish_dir_mode + container "quay.io/biocontainers/multiqc:1.9--py_1" + + conda (params.conda ? 
"bioconda::multiqc=1.9" : null) + input: path software_versions path fastqc_html diff --git a/modules/nf-core/software/samtools_faidx.nf b/modules/nf-core/software/samtools/faidx.nf similarity index 71% rename from modules/nf-core/software/samtools_faidx.nf rename to modules/nf-core/software/samtools/faidx.nf index 3242d7fac3..152d32cf5f 100644 --- a/modules/nf-core/software/samtools_faidx.nf +++ b/modules/nf-core/software/samtools/faidx.nf @@ -4,6 +4,10 @@ process SAMTOOLS_FAIDX { publishDir params.outdir, mode: params.publish_dir_mode, saveAs: {params.save_reference ? "reference_genome/${it}" : null } + container "quay.io/biocontainers/samtools:1.10--h2e538c0_3" + + conda (params.conda ? "bioconda::samtools=1.10" : null) + input: path fasta diff --git a/modules/nf-core/software/samtools_index.nf b/modules/nf-core/software/samtools/index.nf similarity index 82% rename from modules/nf-core/software/samtools_index.nf rename to modules/nf-core/software/samtools/index.nf index 23ef860327..5487c5dbc7 100644 --- a/modules/nf-core/software/samtools_index.nf +++ b/modules/nf-core/software/samtools/index.nf @@ -9,6 +9,10 @@ process SAMTOOLS_INDEX { else null } + container "quay.io/biocontainers/samtools:1.10--h2e538c0_3" + + conda (params.conda ? "bioconda::samtools=1.10" : null) + input: tuple val(meta), path(bam) val options diff --git a/modules/nf-core/software/samtools_stats.nf b/modules/nf-core/software/samtools/stats.nf similarity index 73% rename from modules/nf-core/software/samtools_stats.nf rename to modules/nf-core/software/samtools/stats.nf index a3cbc23d7f..6dfb321efa 100644 --- a/modules/nf-core/software/samtools_stats.nf +++ b/modules/nf-core/software/samtools/stats.nf @@ -5,6 +5,10 @@ process SAMTOOLS_STATS { publishDir "${params.outdir}/Reports/${meta.id}/SamToolsStats", mode: params.publish_dir_mode + container "quay.io/biocontainers/samtools:1.10--h2e538c0_3" + + conda (params.conda ? "bioconda::samtools=1.10" : null) + input: tuple val(meta), path(bam) diff --git a/modules/nf-core/software/strelka.nf b/modules/nf-core/software/strelka.nf index e8db199108..e7e15db507 100644 --- a/modules/nf-core/software/strelka.nf +++ b/modules/nf-core/software/strelka.nf @@ -11,8 +11,6 @@ process STRELKA { mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } - - container "quay.io/biocontainers/strelka:2.9.10--0" conda (params.conda ? "bioconda::strelka=2.9.10" : null) diff --git a/modules/nf-core/software/trimgalore.nf b/modules/nf-core/software/trimgalore.nf index ad4cc1e5d3..440d00d808 100644 --- a/modules/nf-core/software/trimgalore.nf +++ b/modules/nf-core/software/trimgalore.nf @@ -11,6 +11,8 @@ process TRIMGALORE { container "quay.io/biocontainers/trim-galore:0.6.5--0" + conda (params.conda ? 
"trim-galore=0.6.5" : null) + input: tuple val(meta), path(reads) val options diff --git a/nextflow.config b/nextflow.config index 7f5452dbb6..9878efde48 100644 --- a/nextflow.config +++ b/nextflow.config @@ -105,9 +105,9 @@ params { } // Container slug -// Stable releases should specify release tag (ie: `2.5.2`) +// Stable releases should specify release tag (ie: `dsl2`) // Developmental code should specify dev -process.container = 'nfcore/sarek:dev' +// process.container = 'nfcore/sarek:dsl2' // Load base.config by default for all pipelines includeConfig 'conf/base.config' @@ -132,7 +132,6 @@ try { profiles { conda { docker.enabled = false - process.conda = "$baseDir/environment.yml" singularity.enabled = false params.conda = true } From 62ec7e298dbb38377171d11d791b94f587adf343 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 29 Sep 2020 15:59:14 +0200 Subject: [PATCH 128/200] get rid of some warnings --- .github/workflows/branch.yml | 29 +++++++++++++++++++++++++---- .github/workflows/linting.yml | 22 ++++++++++++++++++++-- Dockerfile | 8 ++++++-- environment.yml | 2 +- lib/Completion.groovy | 1 - 5 files changed, 52 insertions(+), 10 deletions(-) diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml index 76ce81f6aa..11d4631507 100644 --- a/.github/workflows/branch.yml +++ b/.github/workflows/branch.yml @@ -3,14 +3,35 @@ name: nf-core branch protection # It fails when someone tries to make a PR against the nf-core `master` branch instead of `dev` on: pull_request: - branches: - - master + branches: [master] jobs: test: runs-on: ubuntu-latest steps: - # PRs are only ok if coming from an nf-core `dev` branch or a fork `patch` branch + # PRs to the nf-core repo master branch are only ok if coming from the nf-core repo `dev` or any `patch` branches - name: Check PRs + if: github.repository == 'nf-core/sarek' run: | - { [[ $(git remote get-url origin) == *nf-core/sarek ]] && [[ ${GITHUB_HEAD_REF} = "dev" ]]; } || [[ ${GITHUB_HEAD_REF} == "patch" ]] + { [[ ${{github.event.pull_request.head.repo.full_name}} == nf-core/sarek ]] && [[ $GITHUB_HEAD_REF = "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] + + + # If the above check failed, post a comment on the PR explaining the failure + # NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub actions secrets + - name: Post PR comment + if: failure() + uses: mshick/add-pr-comment@v1 + with: + message: | + Hi @${{ github.event.pull_request.user.login }}, + + It looks like this pull-request is has been made against the ${{github.event.pull_request.head.repo.full_name}} `master` branch. + The `master` branch on nf-core repositories should always contain code from the latest release. + Because of this, PRs to `master` are only allowed if they come from the ${{github.event.pull_request.head.repo.full_name}} `dev` branch. + + You do not need to close this PR, you can change the target branch to `dev` by clicking the _"Edit"_ button at the top of this page. + + Thanks again for your contribution! 
+ repo-token: ${{ secrets.GITHUB_TOKEN }} + allow-repeats: false + diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 1e0827a800..8e8d5bbcf7 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -33,18 +33,36 @@ jobs: nf-core: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + + - name: Check out pipeline code + uses: actions/checkout@v2 + - name: Install Nextflow run: | wget -qO- get.nextflow.io | bash sudo mv nextflow /usr/local/bin/ + - uses: actions/setup-python@v1 with: python-version: '3.6' architecture: 'x64' + - name: Install dependencies run: | python -m pip install --upgrade pip pip install nf-core + - name: Run nf-core lint - run: nf-core lint ${GITHUB_WORKSPACE} + env: + GITHUB_COMMENTS_URL: ${{ github.event.pull_request.comments_url }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_PR_COMMIT: ${{ github.event.pull_request.head.sha }} + run: nf-core -l lint_log.txt lint ${GITHUB_WORKSPACE} + + - name: Upload linting log file artifact + if: ${{ always() }} + uses: actions/upload-artifact@v2 + with: + name: linting-log-file + path: lint_log.txt + diff --git a/Dockerfile b/Dockerfile index aa89d1b9d0..5c844d76ca 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,13 +1,17 @@ -FROM nfcore/base:1.9 +FROM nfcore/base:1.10.2 LABEL authors="Maxime Garcia, Szilveszter Juhos" \ description="Docker image containing all software requirements for the nf-core/sarek pipeline" # Install the conda environment COPY environment.yml / -RUN conda env create -f /environment.yml && conda clean -a +RUN conda env create --quiet -f /environment.yml && conda clean -a # Add conda installation dir to PATH (instead of doing 'conda activate') ENV PATH /opt/conda/envs/nf-core-sarek-3.0dev/bin:$PATH # Dump the details of the installed packages to a file for posterity RUN conda env export --name nf-core-sarek-3.0dev > nf-core-sarek-3.0dev.yml + +# Instruct R processes to use these empty files instead of clashing with a local version +RUN touch .Rprofile +RUN touch .Renviron diff --git a/environment.yml b/environment.yml index 34b98a864d..89afd25977 100644 --- a/environment.yml +++ b/environment.yml @@ -12,7 +12,7 @@ dependencies: - bioconda::bwa-mem2=2.0 - bioconda::ensembl-vep=99.2 - bioconda::genesplicer=1.0 - - bioconda::samtools=1.9 + - bioconda::samtools=1.10 - bioconda::snpeff=4.3.1t - conda-forge::pigz=2.3.4 - conda-forge::r-ggplot2=3.3.0 diff --git a/lib/Completion.groovy b/lib/Completion.groovy index 0a7a2b555d..996276b8e6 100644 --- a/lib/Completion.groovy +++ b/lib/Completion.groovy @@ -34,7 +34,6 @@ class Completion { email_fields['summary']['Nextflow Build'] = workflow.nextflow.build email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp - // TODO nf-core: If not using MultiQC, strip out this code (including params.max_multiqc_email_size) // On success try attach the multiqc report def mqc_report = null try { From 2bf4c8e54157d6392ffc845f291062ae7affa749 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 29 Sep 2020 16:10:31 +0200 Subject: [PATCH 129/200] use correct container --- .github/workflows/ci.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8138cb1002..c4b0e5cef8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -32,12 +32,12 @@ jobs: environment.yml - name: Build new docker image if: env.GIT_DIFF - run: docker build --no-cache . 
-t nfcore/sarek:dev + run: docker build --no-cache . -t nfcore/sarek:dsl2 - name: Pull docker image if: ${{ !env.GIT_DIFF }} run: | - docker pull nfcore/sarek:dev - docker tag nfcore/sarek:dev nfcore/sarek:dev + docker pull nfcore/sarek:dsl2 + docker tag nfcore/sarek:dsl2 nfcore/sarek:dsl2 - name: Install Nextflow run: | wget -qO- get.nextflow.io | bash @@ -65,7 +65,7 @@ jobs: NXF_VER: '20.07.1' - name: Pull docker image run: | - docker pull nfcore/sarek:dev + docker pull nfcore/sarek:dsl2 docker pull nfcore/sarek${{ matrix.tools }}:dev.${{ matrix.species }} - name: Run annotation test run: nextflow run ${GITHUB_WORKSPACE} -profile test_annotation,docker --tools ${{ matrix.tools }} @@ -87,7 +87,7 @@ jobs: # Only check Nextflow pipeline minimum version NXF_VER: '20.07.1' - name: Pull docker image - run: docker pull nfcore/sarek:dev + run: docker pull nfcore/sarek:dsl2 - name: Get test data run: git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data - name: Run germline test @@ -115,7 +115,7 @@ jobs: # Only check Nextflow pipeline minimum version NXF_VER: '20.07.1' - name: Pull docker image - run: docker pull nfcore/sarek:dev + run: docker pull nfcore/sarek:dsl2 - name: Run test for minimal genomes run: nextflow run ${GITHUB_WORKSPACE} -profile test,docker --skipQC all --genome ${{ matrix.genome }} ${{ matrix.intervals }} --tools Manta,mpileup,Strelka,FreeBayes @@ -136,7 +136,7 @@ jobs: # Only check Nextflow pipeline minimum version NXF_VER: '20.07.1' - name: Pull docker image - run: docker pull nfcore/sarek:dev + run: docker pull nfcore/sarek:dsl2 - name: Run ${{ matrix.profile }} test run: nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.profile }},docker @@ -157,7 +157,7 @@ jobs: # Only check Nextflow pipeline minimum version NXF_VER: '20.07.1' - name: Pull docker image - run: docker pull nfcore/sarek:dev + run: docker pull nfcore/sarek:dsl2 - name: Run ${{ matrix.profile }} test run: nextflow run ${GITHUB_WORKSPACE} -profile test,docker --aligner ${{ matrix.aligner }} @@ -188,6 +188,6 @@ jobs: # Only check Nextflow pipeline minimum version NXF_VER: '20.07.1' - name: Pull docker image - run: docker pull nfcore/sarek:dev + run: docker pull nfcore/sarek:dsl2 - name: Run ${{ matrix.tool }} test run: nextflow run ${GITHUB_WORKSPACE} -profile test_tool,docker --tools ${{ matrix.tool }} ${{ matrix.intervals }} From bc62c27565822f9a5036ecb555bcc0616a60d5a6 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 29 Sep 2020 16:30:38 +0200 Subject: [PATCH 130/200] comment and reorganize CI tests --- .github/workflows/ci.yml | 199 ++++++++++++++++++++------------------- 1 file changed, 100 insertions(+), 99 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c4b0e5cef8..cb4a174966 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -46,14 +46,13 @@ jobs: run: | nextflow run ${GITHUB_WORKSPACE} -profile test,docker - annotation: + aligner: env: NXF_ANSI_LOG: false runs-on: ubuntu-latest strategy: matrix: - tools: [snpeff, vep] - species: [WBcel235] + aligner: [bwa-mem, bwa-mem2] steps: - uses: actions/checkout@v2 - name: Install Nextflow @@ -64,11 +63,9 @@ jobs: # Only check Nextflow pipeline minimum version NXF_VER: '20.07.1' - name: Pull docker image - run: | - docker pull nfcore/sarek:dsl2 - docker pull nfcore/sarek${{ matrix.tools }}:dev.${{ matrix.species }} - - name: Run annotation test - run: nextflow run ${GITHUB_WORKSPACE} -profile test_annotation,docker --tools ${{ matrix.tools }} + run: docker pull 
nfcore/sarek:dsl2 + - name: Run ${{ matrix.profile }} test + run: nextflow run ${GITHUB_WORKSPACE} -profile test,docker --aligner ${{ matrix.aligner }} germline: env: @@ -97,97 +94,101 @@ jobs: nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step recalibrate -resume nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step variantCalling - minimal: - env: - NXF_ANSI_LOG: false - runs-on: ubuntu-latest - strategy: - matrix: - genome: [smallerGRCh37, minimalGRCh37] - intervals: [--no_intervals, ''] - steps: - - uses: actions/checkout@v2 - - name: Install Nextflow - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ - env: - # Only check Nextflow pipeline minimum version - NXF_VER: '20.07.1' - - name: Pull docker image - run: docker pull nfcore/sarek:dsl2 - - name: Run test for minimal genomes - run: nextflow run ${GITHUB_WORKSPACE} -profile test,docker --skipQC all --genome ${{ matrix.genome }} ${{ matrix.intervals }} --tools Manta,mpileup,Strelka,FreeBayes + # annotation: + # env: + # NXF_ANSI_LOG: false + # runs-on: ubuntu-latest + # strategy: + # matrix: + # tools: [snpeff, vep] + # species: [WBcel235] + # steps: + # - uses: actions/checkout@v2 + # - name: Install Nextflow + # run: | + # wget -qO- get.nextflow.io | bash + # sudo mv nextflow /usr/local/bin/ + # env: + # # Only check Nextflow pipeline minimum version + # NXF_VER: '20.07.1' + # - name: Pull docker image + # run: | + # docker pull nfcore/sarek:dsl2 + # docker pull nfcore/sarek${{ matrix.tools }}:dev.${{ matrix.species }} + # - name: Run annotation test + # run: nextflow run ${GITHUB_WORKSPACE} -profile test_annotation,docker --tools ${{ matrix.tools }} - profile: - env: - NXF_ANSI_LOG: false - runs-on: ubuntu-latest - strategy: - matrix: - profile: [test_split_fastq, test_targeted, test_trimming, test_no_gatk_spark, test_umi_tso, test_umi_qiaseq] - steps: - - uses: actions/checkout@v2 - - name: Install Nextflow - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ - env: - # Only check Nextflow pipeline minimum version - NXF_VER: '20.07.1' - - name: Pull docker image - run: docker pull nfcore/sarek:dsl2 - - name: Run ${{ matrix.profile }} test - run: nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.profile }},docker + # minimal: + # env: + # NXF_ANSI_LOG: false + # runs-on: ubuntu-latest + # strategy: + # matrix: + # genome: [smallerGRCh37, minimalGRCh37] + # intervals: [--no_intervals, ''] + # steps: + # - uses: actions/checkout@v2 + # - name: Install Nextflow + # run: | + # wget -qO- get.nextflow.io | bash + # sudo mv nextflow /usr/local/bin/ + # env: + # # Only check Nextflow pipeline minimum version + # NXF_VER: '20.07.1' + # - name: Pull docker image + # run: docker pull nfcore/sarek:dsl2 + # - name: Run test for minimal genomes + # run: nextflow run ${GITHUB_WORKSPACE} -profile test,docker --skipQC all --genome ${{ matrix.genome }} ${{ matrix.intervals }} --tools Manta,mpileup,Strelka,FreeBayes - aligner: - env: - NXF_ANSI_LOG: false - runs-on: ubuntu-latest - strategy: - matrix: - aligner: [bwa-mem, bwa-mem2] - steps: - - uses: actions/checkout@v2 - - name: Install Nextflow - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ - env: - # Only check Nextflow pipeline minimum version - NXF_VER: '20.07.1' - - name: Pull docker image - run: docker pull nfcore/sarek:dsl2 - - name: Run ${{ matrix.profile }} test - run: nextflow run ${GITHUB_WORKSPACE} 
-profile test,docker --aligner ${{ matrix.aligner }} + # profile: + # env: + # NXF_ANSI_LOG: false + # runs-on: ubuntu-latest + # strategy: + # matrix: + # profile: [test_split_fastq, test_targeted, test_trimming, test_no_gatk_spark, test_umi_tso, test_umi_qiaseq] + # steps: + # - uses: actions/checkout@v2 + # - name: Install Nextflow + # run: | + # wget -qO- get.nextflow.io | bash + # sudo mv nextflow /usr/local/bin/ + # env: + # # Only check Nextflow pipeline minimum version + # NXF_VER: '20.07.1' + # - name: Pull docker image + # run: docker pull nfcore/sarek:dsl2 + # - name: Run ${{ matrix.profile }} test + # run: nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.profile }},docker - tools: - env: - NXF_ANSI_LOG: false - runs-on: ubuntu-latest - strategy: - matrix: - tool: [Haplotypecaller, Freebayes, Manta, mpileup, MSIsensor, Strelka, TIDDIT] - intervals: [--no_intervals, ''] - exclude: - - tool: Manta - intervals: --no_intervals - - tool: MSIsensor - intervals: --no_intervals - - tool: Strelka - intervals: --no_intervals - - tool: TIDDIT - intervals: --no_intervals - steps: - - uses: actions/checkout@v2 - - name: Install Nextflow - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ - env: - # Only check Nextflow pipeline minimum version - NXF_VER: '20.07.1' - - name: Pull docker image - run: docker pull nfcore/sarek:dsl2 - - name: Run ${{ matrix.tool }} test - run: nextflow run ${GITHUB_WORKSPACE} -profile test_tool,docker --tools ${{ matrix.tool }} ${{ matrix.intervals }} + # tools: + # env: + # NXF_ANSI_LOG: false + # runs-on: ubuntu-latest + # strategy: + # matrix: + # tool: [Haplotypecaller, Freebayes, Manta, mpileup, MSIsensor, Strelka, TIDDIT] + # intervals: [--no_intervals, ''] + # exclude: + # - tool: Manta + # intervals: --no_intervals + # - tool: MSIsensor + # intervals: --no_intervals + # - tool: Strelka + # intervals: --no_intervals + # - tool: TIDDIT + # intervals: --no_intervals + + # steps: + # - uses: actions/checkout@v2 + # - name: Install Nextflow + # run: | + # wget -qO- get.nextflow.io | bash + # sudo mv nextflow /usr/local/bin/ + # env: + # # Only check Nextflow pipeline minimum version + # NXF_VER: '20.07.1' + # - name: Pull docker image + # run: docker pull nfcore/sarek:dsl2 + # - name: Run ${{ matrix.tool }} test + # run: nextflow run ${GITHUB_WORKSPACE} -profile test_tool,docker --tools ${{ matrix.tool }} ${{ matrix.intervals }} From cddb49c0b4a55db43b4fb388b30da914c0827521 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 29 Sep 2020 17:31:28 +0200 Subject: [PATCH 131/200] fix --input for folder --- main.nf | 4 ---- modules/local/functions.nf | 42 +++++++++++++++++--------------------- 2 files changed, 19 insertions(+), 27 deletions(-) diff --git a/main.nf b/main.nf index 13af1ed4ed..c36d86390f 100644 --- a/main.nf +++ b/main.nf @@ -158,10 +158,6 @@ if (tsv_path) { log.info "Reading ${params.input} directory" log.warn "[nf-core/sarek] in ${params.input} directory, all fastqs are assuming to be from the same sample, which is assumed to be a germline one" input_sample = extract_fastq_from_dir(params.input) - (input_sample, fastq_tmp) = input_sample.into(2) - fastq_tmp.toList().subscribe onNext: { - if (it.size() == 0) exit 1, "No FASTQ files found in --input directory '${params.input}'" - } tsv_file = params.input // used in the reports } else if (tsv_path && step == 'annotate') { log.info "Annotating ${tsv_path}" diff --git a/modules/local/functions.nf b/modules/local/functions.nf index 1ef236bf85..5190656e7b 100644 --- 
a/modules/local/functions.nf +++ b/modules/local/functions.nf @@ -109,30 +109,26 @@ def extract_bam(tsvFile) { // Create a channel of germline FASTQs from a directory pattern: "my_samples/*/" // All FASTQ files in subdirectories are collected and emitted; // they must have _R1_ and _R2_ in their names. -def extract_fastq_from_dir(pattern) { - def fastq = Channel.create() - // a temporary channel does all the work - Channel - .fromPath(pattern, type: 'dir') - .ifEmpty { error "No directories found matching pattern '${pattern}'" } - .subscribe onNext: { sampleDir -> - // the last name of the sampleDir is assumed to be a unique sample id - sampleId = sampleDir.getFileName().toString() +// All FASTQ files are assumed to be from the same sample. +def extract_fastq_from_dir(folder) { + sample = file(folder).getFileName().toString() + + fastq = Channel.fromFilePairs(folder + '/*{_R1_,_R2_}*.fastq.gz') + .ifEmpty { error "No directories found matching folder '${folder}'" } + + fastq = fastq.map{ run, pair -> + def meta = [:] + meta.patient = sample + meta.sample = sample + meta.gender = 'ZZ' // unused + meta.status = 0 // normal (not tumor) + meta.run = run + meta.id = "${meta.sample}-${meta.run}" + read1 = pair[0] + read2 = pair[1] - for (path1 in file("${sampleDir}/**_R1_*.fastq.gz")) { - assert path1.getName().contains('_R1_') - path2 = file(path1.toString().replace('_R1_', '_R2_')) - if (!path2.exists()) error "Path '${path2}' not found" - (flowcell, lane) = flowcellLane_from_fastq(path1) - patient = sampleId - gender = 'ZZ' // unused - status = 0 // normal (not tumor) - rgId = "${flowcell}.${sampleId}.${lane}" - result = [patient, gender, status, sampleId, rgId, path1, path2] - fastq.bind(result) - } - }, onComplete: { fastq.close() } - fastq + return [meta, [read1, read2]] + } } // Channeling the TSV file containing FASTQ or BAM From 536a3461c364cadf3ecbd53bd8aa1d4e03c8e6d8 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 29 Sep 2020 17:39:50 +0200 Subject: [PATCH 132/200] fix --input for folder (this time for real) --- modules/local/functions.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/functions.nf b/modules/local/functions.nf index 5190656e7b..8e07d6ef81 100644 --- a/modules/local/functions.nf +++ b/modules/local/functions.nf @@ -119,7 +119,7 @@ def extract_fastq_from_dir(folder) { fastq = fastq.map{ run, pair -> def meta = [:] meta.patient = sample - meta.sample = sample + meta.sample = meta.patient meta.gender = 'ZZ' // unused meta.status = 0 // normal (not tumor) meta.run = run From b7d3f9e0c5bbef8c3e888d3051a3bfa4e4c8667c Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 30 Sep 2020 16:34:02 +0200 Subject: [PATCH 133/200] MAPPING as a separate workflow --- .github/workflows/ci.yml | 35 ++++-- main.nf | 139 ++++++--------------- modules/local/functions.nf | 27 ++-- modules/local/subworkflow/mapping.nf | 104 +++++++++++++++ modules/nf-core/software/multiqc.nf | 7 +- modules/nf-core/software/samtools/index.nf | 2 +- 6 files changed, 186 insertions(+), 128 deletions(-) create mode 100644 modules/local/subworkflow/mapping.nf diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cb4a174966..a5e1cb2684 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -71,9 +71,6 @@ jobs: env: NXF_ANSI_LOG: false runs-on: ubuntu-latest - strategy: - matrix: - markduplicates: [--skip_markduplicates, ''] steps: - uses: actions/checkout@v2 - name: Install Nextflow @@ -89,10 +86,34 @@ jobs: run: git clone --single-branch --branch 
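A sketch of what the extract_fastq_from_dir rewrite in the patch above emits, assuming a hypothetical directory /data/sample1 holding one _R1_/_R2_ pair; the run id is whatever pair id Channel.fromFilePairs derives from the file names:

    // shape of each emitted channel element (illustration only):
    // [ [patient: 'sample1', sample: 'sample1', gender: 'ZZ', status: 0,
    //    run: <pair id from fromFilePairs>, id: 'sample1-<pair id>'],
    //   [<path to _R1_ file>, <path to _R2_ file>] ]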
sarek https://github.com/nf-core/test-datasets.git data - name: Run germline test run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input data/testdata/tiny/normal --saved_bam_mapped - nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step prepare_recalibration -resume - nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step recalibrate -resume - nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step variantCalling + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --input data/testdata/tiny/normal + + # germline: + # env: + # NXF_ANSI_LOG: false + # runs-on: ubuntu-latest + # strategy: + # matrix: + # markduplicates: [--skip_markduplicates, ''] + # steps: + # - uses: actions/checkout@v2 + # - name: Install Nextflow + # run: | + # wget -qO- get.nextflow.io | bash + # sudo mv nextflow /usr/local/bin/ + # env: + # # Only check Nextflow pipeline minimum version + # NXF_VER: '20.07.1' + # - name: Pull docker image + # run: docker pull nfcore/sarek:dsl2 + # - name: Get test data + # run: git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data + # - name: Run germline test + # run: | + # nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input data/testdata/tiny/normal --saved_bam_mapped + # nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step prepare_recalibration -resume + # nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step recalibrate -resume + # nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step variantCalling # annotation: # env: diff --git a/main.nf b/main.nf index c36d86390f..bc17b35952 100644 --- a/main.nf +++ b/main.nf @@ -46,6 +46,7 @@ include { extract_bam; extract_fastq; extract_fastq_from_dir; + extract_recal; has_extension } from './modules/local/functions' @@ -256,11 +257,8 @@ if (params.sentieon) log.warn "[nf-core/sarek] Sentieon will be used, only works ================================================================================ */ -include { BWA_MEM as BWAMEM1_MEM } from './modules/local/process/bwa_mem' -include { BWAMEM2_MEM } from './modules/local/process/bwamem2_mem' include { GET_SOFTWARE_VERSIONS } from './modules/local/process/get_software_versions' include { OUTPUT_DOCUMENTATION } from './modules/local/process/output_documentation' -include { MERGE_BAM as MERGE_BAM_MAPPED } from './modules/local/process/merge_bam' include { MERGE_BAM as MERGE_BAM_RECAL } from './modules/local/process/merge_bam' /* @@ -270,6 +268,7 @@ include { MERGE_BAM as MERGE_BAM_RECAL } from './modules/local/process/merge_ba */ include { BUILD_INDICES } from './modules/local/subworkflow/build_indices' +include { MAPPING } from './modules/local/subworkflow/mapping' /* ================================================================================ @@ -281,7 +280,6 @@ include { GATK_BASERECALIBRATOR as BASERECALIBRATOR } from './modules/nf-c include { GATK_GATHERBQSRREPORTS as GATHERBQSRREPORTS } from './modules/nf-core/software/gatk/gatherbqsrreports' include { GATK_MARKDUPLICATES as MARKDUPLICATES } from './modules/nf-core/software/gatk/markduplicates' include { GATK_APPLYBQSR as APPLYBQSR } from './modules/nf-core/software/gatk/applybqsr' -include { SAMTOOLS_INDEX as 
SAMTOOLS_INDEX_MAPPED } from './modules/nf-core/software/samtools/index' include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_RECAL } from './modules/nf-core/software/samtools/index' include { SAMTOOLS_STATS as SAMTOOLS_STATS } from './modules/nf-core/software/samtools/stats' include { QUALIMAP_BAMQC as BAMQC } from './modules/nf-core/software/qualimap_bamqc' @@ -373,78 +371,40 @@ workflow { ================================================================================ */ - // STEP 0.5: QC ON READS - - QC_TRIM( - input_sample, - ('fastqc' in skip_qc), - !(params.trim_fastq), - params.modules['fastqc'], - params.modules['trimgalore'] - ) - - reads_input = QC_TRIM.out.reads - - // STEP 1: MAPPING READS TO REFERENCE GENOME WITH BWA MEM - - bam_bwamem1 = Channel.empty() - bam_bwamem2 = Channel.empty() - - if (params.aligner == "bwa-mem") { - BWAMEM1_MEM(reads_input, bwa, fasta, fai, params.modules['bwa_mem']) - bam_bwamem1 = BWAMEM1_MEM.out.bam - } else { - BWAMEM2_MEM(reads_input, bwa, fasta, fai, params.modules['bwamem2_mem']) - bam_bwamem2 = BWAMEM2_MEM.out - } - - bam_bwa = bam_bwamem1.mix(bam_bwamem2) - - bam_bwa.map{ meta, bam -> - patient = meta.patient - sample = meta.sample - gender = meta.gender - status = meta.status - [patient, sample, gender, status, bam] - }.groupTuple(by: [0,1]) - .branch{ - single: it[4].size() == 1 - multiple: it[4].size() > 1 - }.set{ bam_bwa_to_sort } - - bam_bwa_single = bam_bwa_to_sort.single.map { - patient, sample, gender, status, bam -> - - def meta = [:] - meta.patient = patient - meta.sample = sample - meta.gender = gender[0] - meta.status = status[0] - meta.id = sample - - [meta, bam[0]] - } - - bam_bwa_multiple = bam_bwa_to_sort.multiple.map { - patient, sample, gender, status, bam -> - - def meta = [:] - meta.patient = patient - meta.sample = sample - meta.gender = gender[0] - meta.status = status[0] - meta.id = sample - - [meta, bam] + qc_reports = Channel.empty() + bam_mapped = Channel.empty() + + if (step == 'mapping') { + + // STEP 0.5: QC ON READS + + QC_TRIM( + input_sample, + ('fastqc' in skip_qc), + !(params.trim_fastq), + params.modules['fastqc'], + params.modules['trimgalore'] + ) + reads_input = QC_TRIM.out.reads + + qc_reports = qc_reports.mix( + QC_TRIM.out.fastqc_html, + QC_TRIM.out.fastqc_zip, + QC_TRIM.out.trimgalore_html, + QC_TRIM.out.trimgalore_log, + QC_TRIM.out.trimgalore_zip) + + // STEP 1: MAPPING READS TO REFERENCE GENOME WITH BWA MEM + + MAPPING( + reads_input, + bwa, + fasta, + fai + ) + bam_mapped = MAPPING.out.bam_mapped } - // STEP 1.5: MERGING AND INDEXING BAM FROM MULTIPLE LANES - - MERGE_BAM_MAPPED(bam_bwa_multiple) - bam_mapped = bam_bwa_single.mix(MERGE_BAM_MAPPED.out.bam) - bam_mapped = SAMTOOLS_INDEX_MAPPED(bam_mapped, params.modules['samtools_index_mapped'], -) - // STEP 2: MARKING DUPLICATES report_markduplicates = Channel.empty() @@ -479,7 +439,7 @@ workflow { "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n" }.collectFile(name: 'duplicates_marked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV") } else { - tsv_no_markduplicates = bam_markduplicates.map { meta, bam -> [meta] } + tsv_no_markduplicates = bam_markduplicates.map { meta, bam, bai -> [meta] } // Creating TSV files to restart from this step tsv_no_markduplicates.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> @@ -698,15 +658,10 @@ workflow { MULTIQC( GET_SOFTWARE_VERSIONS.out.yml, - QC_TRIM.out.fastqc_html.collect().ifEmpty([]), - QC_TRIM.out.fastqc_zip.collect().ifEmpty([]), - 
QC_TRIM.out.trimgalore_html.collect().ifEmpty([]), - QC_TRIM.out.trimgalore_log.collect().ifEmpty([]), - QC_TRIM.out.trimgalore_zip.collect().ifEmpty([]), multiqc_config, multiqc_custom_config.ifEmpty([]), - report_markduplicates.collect().ifEmpty([]), - workflow_summary) + workflow_summary, + qc_reports.collect()) } /* @@ -721,26 +676,6 @@ workflow.onComplete { Completion.summary(workflow, params, log) } -// // Creating a TSV file to restart from this step -// tsv_bam_indexed.map { idPatient, idSample -> -// gender = gender_map[idPatient] -// status = status_map[idPatient, idSample] -// bam = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam" -// bai = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam.bai" -// "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n" -// }.collectFile( -// name: 'mapped.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" -// ) - -// tsv_bam_indexed_sample -// .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { idPatient, idSample -> -// status = status_map[idPatient, idSample] -// gender = gender_map[idPatient] -// bam = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam" -// bai = "${params.outdir}/Preprocessing/${idSample}/Mapped/${idSample}.bam.bai" -// ["mapped_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] -// } - // /* // ================================================================================ // GERMLINE VARIANT CALLING diff --git a/modules/local/functions.nf b/modules/local/functions.nf index 8e07d6ef81..2fb215624e 100644 --- a/modules/local/functions.nf +++ b/modules/local/functions.nf @@ -188,20 +188,23 @@ def extract_recal(tsvFile) { .splitCsv(sep: '\t') .map { row -> check_number_of_item(row, 7) - def idPatient = row[0] - def gender = row[1] - def status = return_status(row[2].toInteger()) - def idSample = row[3] - def bamFile = return_file(row[4]) - def baiFile = return_file(row[5]) - def recalTable = return_file(row[6]) + def meta = [:] - if (!has_extension(bamFile, "bam")) exit 1, "File: ${bamFile} has the wrong extension. See --help for more information" - if (!has_extension(baiFile, "bai")) exit 1, "File: ${baiFile} has the wrong extension. See --help for more information" - if (!has_extension(recalTable, "recal.table")) exit 1, "File: ${recalTable} has the wrong extension. See --help for more information" + meta.patient = row[0] + meta.gender = row[1] + meta.status = return_status(row[2].toInteger()) + meta.sample = row[3] + meta.id = meta.sample + bam = return_file(row[4]) + bai = return_file(row[5]) + table = return_file(row[6]) - [idPatient, gender, status, idSample, bamFile, baiFile, recalTable] - } + if (!has_extension(bam, "bam")) exit 1, "File: ${bam} has the wrong extension. See --help for more information" + if (!has_extension(bai, "bai")) exit 1, "File: ${bai} has the wrong extension. See --help for more information" + if (!has_extension(table, "recal.table")) exit 1, "File: ${table} has the wrong extension. See --help for more information" + + return [meta, [bam, bai, table]] + } } // Parse first line of a FASTQ file, return the flowcell id and lane number. 
diff --git a/modules/local/subworkflow/mapping.nf b/modules/local/subworkflow/mapping.nf new file mode 100644 index 0000000000..dce167221c --- /dev/null +++ b/modules/local/subworkflow/mapping.nf @@ -0,0 +1,104 @@ +/* +================================================================================ + MAPPING +================================================================================ +*/ + +include { BWA_MEM as BWAMEM1_MEM } from '../process/bwa_mem' +include { BWAMEM2_MEM } from '../process/bwamem2_mem' +include { MERGE_BAM as MERGE_BAM_MAPPED } from '../process/merge_bam' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_MAPPED } from '../../nf-core/software/samtools/index' + +workflow MAPPING { + take: + reads_input + bwa + fasta + fai + + main: + + bam_bwamem1 = Channel.empty() + bam_bwamem2 = Channel.empty() + + if (params.aligner == "bwa-mem") { + BWAMEM1_MEM(reads_input, bwa, fasta, fai, params.modules['bwa_mem']) + bam_bwamem1 = BWAMEM1_MEM.out.bam + } else { + BWAMEM2_MEM(reads_input, bwa, fasta, fai, params.modules['bwamem2_mem']) + bam_bwamem2 = BWAMEM2_MEM.out + } + + bam_bwa = bam_bwamem1.mix(bam_bwamem2) + + bam_bwa.map{ meta, bam -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + [patient, sample, gender, status, bam] + }.groupTuple(by: [0,1]) + .branch{ + single: it[4].size() == 1 + multiple: it[4].size() > 1 + }.set{ bam_bwa_to_sort } + + bam_bwa_single = bam_bwa_to_sort.single.map { + patient, sample, gender, status, bam -> + + def meta = [:] + meta.patient = patient + meta.sample = sample + meta.gender = gender[0] + meta.status = status[0] + meta.id = sample + + [meta, bam[0]] + } + + bam_bwa_multiple = bam_bwa_to_sort.multiple.map { + patient, sample, gender, status, bam -> + + def meta = [:] + meta.patient = patient + meta.sample = sample + meta.gender = gender[0] + meta.status = status[0] + meta.id = sample + + [meta, bam] + } + + // STEP 1.5: MERGING AND INDEXING BAM FROM MULTIPLE LANES + + MERGE_BAM_MAPPED(bam_bwa_multiple) + bam_mapped = bam_bwa_single.mix(MERGE_BAM_MAPPED.out.bam) + bam_mapped = SAMTOOLS_INDEX_MAPPED(bam_mapped, params.modules['samtools_index_mapped']) + + if (params.save_bam_mapped) { + tsv_bam_mapped = bam_mapped.map { meta, bam, bai -> [meta] } + // Creating TSV files to restart from this step + tsv_bam_mapped.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> + patient = meta.patient[0] + sample = meta.sample[0] + gender = meta.gender[0] + status = meta.status[0] + bam = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam" + bai = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam.bai" + ["mapped_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n"] + } + + tsv_bam_mapped.map { meta -> + patient = meta.patient[0] + sample = meta.sample[0] + gender = meta.gender[0] + status = meta.status[0] + bam = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam" + bai = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam.bai" + "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n" + }.collectFile(name: "mapped.tsv", sort: true, storeDir: "${params.outdir}/Preprocessing/TSV") + } + + emit: + bam_mapped = bam_mapped +} diff --git a/modules/nf-core/software/multiqc.nf b/modules/nf-core/software/multiqc.nf index f9ddcb46f8..439a68c14e 100644 --- a/modules/nf-core/software/multiqc.nf +++ b/modules/nf-core/software/multiqc.nf @@ -14,15 +14,10 @@ process MULTIQC { input: path software_versions - path fastqc_html - 
path fastqc_zip - path trim_galore_html - path trim_galore_log - path trim_galore_zip path multiqc_config path multiqc_custom_config - path report_markduplicates val workflow_summary + path qc_reports output: path "*multiqc_report.html" diff --git a/modules/nf-core/software/samtools/index.nf b/modules/nf-core/software/samtools/index.nf index 5487c5dbc7..ed832c0205 100644 --- a/modules/nf-core/software/samtools/index.nf +++ b/modules/nf-core/software/samtools/index.nf @@ -5,7 +5,7 @@ process SAMTOOLS_INDEX { publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { - if (save_bam_mapped) "Preprocessing/${meta.sample}/Mapped/${it}" + if (params.save_bam_mapped) "Preprocessing/${meta.sample}/Mapped/${it}" else null } From b675300d242516ee5dd4c8e5310c0792c9cdee04 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 30 Sep 2020 16:57:21 +0200 Subject: [PATCH 134/200] MARKDUPLICATES as a module --- .github/workflows/ci.yml | 5 +- main.nf | 124 ++++++-------------- modules/local/subworkflow/markduplicates.nf | 76 ++++++++++++ 3 files changed, 115 insertions(+), 90 deletions(-) create mode 100644 modules/local/subworkflow/markduplicates.nf diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a5e1cb2684..12059abed2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -71,6 +71,9 @@ jobs: env: NXF_ANSI_LOG: false runs-on: ubuntu-latest + strategy: + matrix: + markduplicates: [--skip_markduplicates, ''] steps: - uses: actions/checkout@v2 - name: Install Nextflow @@ -86,7 +89,7 @@ jobs: run: git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data - name: Run germline test run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --input data/testdata/tiny/normal + nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input data/testdata/tiny/normal --saved_bam_mapped # germline: # env: diff --git a/main.nf b/main.nf index bc17b35952..e1779371fe 100644 --- a/main.nf +++ b/main.nf @@ -267,8 +267,9 @@ include { MERGE_BAM as MERGE_BAM_RECAL } from './modules/local/process/merge_ba ================================================================================ */ -include { BUILD_INDICES } from './modules/local/subworkflow/build_indices' -include { MAPPING } from './modules/local/subworkflow/mapping' +include { BUILD_INDICES } from './modules/local/subworkflow/build_indices' +include { MAPPING } from './modules/local/subworkflow/mapping' +include { MARKDUPLICATES } from './modules/local/subworkflow/markduplicates' /* ================================================================================ @@ -278,7 +279,6 @@ include { MAPPING } from './modules/local/subworkflow/mapping' include { GATK_BASERECALIBRATOR as BASERECALIBRATOR } from './modules/nf-core/software/gatk/baserecalibrator' include { GATK_GATHERBQSRREPORTS as GATHERBQSRREPORTS } from './modules/nf-core/software/gatk/gatherbqsrreports' -include { GATK_MARKDUPLICATES as MARKDUPLICATES } from './modules/nf-core/software/gatk/markduplicates' include { GATK_APPLYBQSR as APPLYBQSR } from './modules/nf-core/software/gatk/applybqsr' include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_RECAL } from './modules/nf-core/software/samtools/index' include { SAMTOOLS_STATS as SAMTOOLS_STATS } from './modules/nf-core/software/samtools/stats' @@ -374,96 +374,42 @@ workflow { qc_reports = Channel.empty() bam_mapped = Channel.empty() - if (step == 'mapping') { - - // STEP 0.5: QC ON READS - - QC_TRIM( - input_sample, - ('fastqc' in skip_qc), - 
!(params.trim_fastq), - params.modules['fastqc'], - params.modules['trimgalore'] - ) - reads_input = QC_TRIM.out.reads - - qc_reports = qc_reports.mix( - QC_TRIM.out.fastqc_html, - QC_TRIM.out.fastqc_zip, - QC_TRIM.out.trimgalore_html, - QC_TRIM.out.trimgalore_log, - QC_TRIM.out.trimgalore_zip) - - // STEP 1: MAPPING READS TO REFERENCE GENOME WITH BWA MEM - - MAPPING( - reads_input, - bwa, - fasta, - fai - ) - bam_mapped = MAPPING.out.bam_mapped - } + if (step == 'mapping') input_reads = input_sample - // STEP 2: MARKING DUPLICATES + // STEP 0.5: QC ON READS - report_markduplicates = Channel.empty() - bam_markduplicates = bam_mapped - - if (!params.skip_markduplicates) { - MARKDUPLICATES(bam_mapped) - report_markduplicates = MARKDUPLICATES.out.report - bam_markduplicates = MARKDUPLICATES.out.bam - tsv_markduplicates = MARKDUPLICATES.out.tsv - - // Creating TSV files to restart from this step - tsv_markduplicates.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> - patient = meta.patient - sample = meta.sample - gender = meta.gender - status = meta.status - bam = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam" - bai = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam.bai" - table = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.recal.table" - ["duplicates_marked_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n"] - } + QC_TRIM( + input_sample, + ('fastqc' in skip_qc), + !(params.trim_fastq), + params.modules['fastqc'], + params.modules['trimgalore'] + ) + reads_input = QC_TRIM.out.reads - tsv_markduplicates.map { meta -> - patient = meta.patient - sample = meta.sample - gender = meta.gender - status = meta.status - bam = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam" - bai = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam.bai" - table = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.recal.table" - "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n" - }.collectFile(name: 'duplicates_marked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV") - } else { - tsv_no_markduplicates = bam_markduplicates.map { meta, bam, bai -> [meta] } - - // Creating TSV files to restart from this step - tsv_no_markduplicates.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> - patient = meta.patient - sample = meta.sample - gender = meta.gender - status = meta.status - bam = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.md.bam" - bai = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.md.bam.bai" - table = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.recal.table" - ["mapped_no_duplicates_marked_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n"] - } + qc_reports = qc_reports.mix( + QC_TRIM.out.fastqc_html, + QC_TRIM.out.fastqc_zip, + QC_TRIM.out.trimgalore_html, + QC_TRIM.out.trimgalore_log, + QC_TRIM.out.trimgalore_zip) - tsv_no_markduplicates.map { meta -> - patient = meta.patient - sample = meta.sample - gender = meta.gender - status = meta.status - bam = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.md.bam" - bai = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.md.bam.bai" - table = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.recal.table" - "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n" - 
}.collectFile(name: 'mapped_no_duplicates_marked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV") - } + // STEP 1: MAPPING READS TO REFERENCE GENOME WITH BWA MEM + + MAPPING( + reads_input, + bwa, + fasta, + fai + ) + + bam_mapped = MAPPING.out.bam_mapped + + // STEP 2: MARKING DUPLICATES + + MARKDUPLICATES(bam_mapped) + + bam_markduplicates = MARKDUPLICATES.out.bam // STEP 3: CREATING RECALIBRATION TABLES bam_baserecalibrator = bam_markduplicates.combine(intervals) diff --git a/modules/local/subworkflow/markduplicates.nf b/modules/local/subworkflow/markduplicates.nf new file mode 100644 index 0000000000..95961a1944 --- /dev/null +++ b/modules/local/subworkflow/markduplicates.nf @@ -0,0 +1,76 @@ +/* +================================================================================ + MAPPING +================================================================================ +*/ + +include { GATK_MARKDUPLICATES } from '../../nf-core/software/gatk/markduplicates' + +workflow MARKDUPLICATES { + take: + bam_mapped + + main: + + bam_markduplicates = bam_mapped + report_markduplicates = Channel.empty() + + if (!params.skip_markduplicates) { + GATK_MARKDUPLICATES(bam_mapped) + report_markduplicates = GATK_MARKDUPLICATES.out.report + bam_markduplicates = GATK_MARKDUPLICATES.out.bam + tsv_markduplicates = GATK_MARKDUPLICATES.out.tsv + + // Creating TSV files to restart from this step + tsv_markduplicates.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + bam = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam" + bai = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam.bai" + table = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.recal.table" + ["duplicates_marked_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n"] + } + + tsv_markduplicates.map { meta -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + bam = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam" + bai = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam.bai" + table = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.recal.table" + "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n" + }.collectFile(name: 'duplicates_marked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV") + } else { + tsv_no_markduplicates = bam_markduplicates.map { meta, bam, bai -> [meta] } + + // Creating TSV files to restart from this step + tsv_no_markduplicates.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + bam = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam" + bai = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam.bai" + table = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.recal.table" + ["mapped_no_duplicates_marked_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n"] + } + + tsv_no_markduplicates.map { meta -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + bam = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam" + bai = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam.bai" + table = 
"${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.recal.table" + "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n" + }.collectFile(name: 'mapped_no_duplicates_marked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV") + } + + emit: + bam = bam_markduplicates + report = report_markduplicates +} From 3149c4c0a8eaee5c9f43e34ebd0222a3a361d038 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 30 Sep 2020 17:02:34 +0200 Subject: [PATCH 135/200] fix: process without container --- modules/local/process/bwamem2_mem.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/local/process/bwamem2_mem.nf b/modules/local/process/bwamem2_mem.nf index aba6188e97..bcca32a2c3 100644 --- a/modules/local/process/bwamem2_mem.nf +++ b/modules/local/process/bwamem2_mem.nf @@ -9,6 +9,8 @@ process BWAMEM2_MEM { if (filename.endsWith('.version.txt')) null else filename } + container "nfcore/sarek:dsl2" + conda (params.conda ? "bioconda::bwa-mem2=2.0 bioconda::samtools=1.10" : null) input: From 384c9b6d26d05a2f1bbef007b303dbeb6a2497d3 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 30 Sep 2020 17:16:33 +0200 Subject: [PATCH 136/200] fix: --step prepare_recalibration --- .github/workflows/ci.yml | 1 + main.nf | 9 ++++++--- modules/local/functions.nf | 29 ++++++++++++++++------------- 3 files changed, 23 insertions(+), 16 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 12059abed2..be28b63de8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -90,6 +90,7 @@ jobs: - name: Run germline test run: | nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input data/testdata/tiny/normal --saved_bam_mapped + nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step prepare_recalibration -resume # germline: # env: diff --git a/main.nf b/main.nf index e1779371fe..03105d6fb9 100644 --- a/main.nf +++ b/main.nf @@ -371,15 +371,16 @@ workflow { ================================================================================ */ - qc_reports = Channel.empty() - bam_mapped = Channel.empty() + qc_reports = Channel.empty() + bam_mapped = Channel.empty() if (step == 'mapping') input_reads = input_sample + else input_reads = Channel.empty() // STEP 0.5: QC ON READS QC_TRIM( - input_sample, + input_reads, ('fastqc' in skip_qc), !(params.trim_fastq), params.modules['fastqc'], @@ -411,6 +412,8 @@ workflow { bam_markduplicates = MARKDUPLICATES.out.bam + if (step == 'preparerecalibration') bam_markduplicates = input_sample + // STEP 3: CREATING RECALIBRATION TABLES bam_baserecalibrator = bam_markduplicates.combine(intervals) BASERECALIBRATOR(bam_baserecalibrator, dbsnp, dbsnp_tbi, dict, fai, fasta, known_indels, known_indels_tbi) diff --git a/modules/local/functions.nf b/modules/local/functions.nf index 2fb215624e..124c452a20 100644 --- a/modules/local/functions.nf +++ b/modules/local/functions.nf @@ -92,17 +92,20 @@ def extract_bam(tsvFile) { .splitCsv(sep: '\t') .map { row -> check_number_of_item(row, 6) - def idPatient = row[0] - def gender = row[1] - def status = return_status(row[2].toInteger()) - def idSample = row[3] - def bamFile = return_file(row[4]) - def baiFile = return_file(row[5]) + def meta = [:] - if (!has_extension(bamFile, "bam")) exit 1, "File: ${bamFile} has the wrong extension. See --help for more information" - if (!has_extension(baiFile, "bai")) exit 1, "File: ${baiFile} has the wrong extension. 
See --help for more information" + meta.patient = row[0] + meta.gender = row[1] + meta.status = return_status(row[2].toInteger()) + meta.sample = row[3] + meta.id = meta.sample + def bam = return_file(row[4]) + def bai = return_file(row[5]) + + if (!has_extension(bam, "bam")) exit 1, "File: ${bam} has the wrong extension. See --help for more information" + if (!has_extension(bai, "bai")) exit 1, "File: ${bai} has the wrong extension. See --help for more information" - return [idPatient, gender, status, idSample, bamFile, baiFile] + return [meta, bam, bai] } } @@ -124,8 +127,8 @@ def extract_fastq_from_dir(folder) { meta.status = 0 // normal (not tumor) meta.run = run meta.id = "${meta.sample}-${meta.run}" - read1 = pair[0] - read2 = pair[1] + def read1 = pair[0] + def read2 = pair[1] return [meta, [read1, read2]] } @@ -145,8 +148,8 @@ def extract_fastq(tsvFile) { meta.sample = row[3] meta.run = row[4] meta.id = "${meta.sample}-${meta.run}" - read1 = return_file(row[5]) - read2 = "null" + def read1 = return_file(row[5]) + def read2 = "null" if (has_extension(read1, "fastq.gz") || has_extension(read1, "fq.gz") || has_extension(read1, "fastq") || has_extension(read1, "fq")) { check_number_of_item(row, 7) read2 = return_file(row[6]) From 7c38825293799b3d4199d5a0a30585f166f297e0 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 30 Sep 2020 17:42:23 +0200 Subject: [PATCH 137/200] fix: --skip_markduplicates --- .github/workflows/ci.yml | 1 + main.nf | 4 +++- modules/local/functions.nf | 8 ++++---- modules/local/subworkflow/markduplicates.nf | 16 ++++++++-------- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index be28b63de8..c48badadf0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -91,6 +91,7 @@ jobs: run: | nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input data/testdata/tiny/normal --saved_bam_mapped nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step prepare_recalibration -resume + nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step recalibrate -resume # germline: # env: diff --git a/main.nf b/main.nf index 03105d6fb9..a90e983446 100644 --- a/main.nf +++ b/main.nf @@ -377,7 +377,7 @@ workflow { if (step == 'mapping') input_reads = input_sample else input_reads = Channel.empty() - // STEP 0.5: QC ON READS + // STEP 0.5: QC & TRIM IF SPECIFIED ON READS QC_TRIM( input_reads, @@ -473,6 +473,8 @@ workflow { // STEP 4: RECALIBRATING bam_applybqsr = bam_markduplicates.join(table_bqsr) + if (step == 'recalibrate') bam_applybqsr = input_sample + bam_applybqsr = bam_applybqsr.combine(intervals) APPLYBQSR(bam_applybqsr, dict, fasta, fai) diff --git a/modules/local/functions.nf b/modules/local/functions.nf index 124c452a20..9b8717a9b5 100644 --- a/modules/local/functions.nf +++ b/modules/local/functions.nf @@ -198,15 +198,15 @@ def extract_recal(tsvFile) { meta.status = return_status(row[2].toInteger()) meta.sample = row[3] meta.id = meta.sample - bam = return_file(row[4]) - bai = return_file(row[5]) - table = return_file(row[6]) + def bam = return_file(row[4]) + def bai = return_file(row[5]) + def table = return_file(row[6]) if (!has_extension(bam, "bam")) exit 1, "File: ${bam} has the wrong extension. See --help for more information" if (!has_extension(bai, "bai")) exit 1, "File: ${bai} has the wrong extension. 
See --help for more information" if (!has_extension(table, "recal.table")) exit 1, "File: ${table} has the wrong extension. See --help for more information" - return [meta, [bam, bai, table]] + return [meta, bam, bai, table] } } diff --git a/modules/local/subworkflow/markduplicates.nf b/modules/local/subworkflow/markduplicates.nf index 95961a1944..acd346f10e 100644 --- a/modules/local/subworkflow/markduplicates.nf +++ b/modules/local/subworkflow/markduplicates.nf @@ -48,10 +48,10 @@ workflow MARKDUPLICATES { // Creating TSV files to restart from this step tsv_no_markduplicates.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> - patient = meta.patient - sample = meta.sample - gender = meta.gender - status = meta.status + patient = meta.patient[0] + sample = meta.sample[0] + gender = meta.gender[0] + status = meta.status[0] bam = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam" bai = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam.bai" table = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.recal.table" @@ -59,10 +59,10 @@ workflow MARKDUPLICATES { } tsv_no_markduplicates.map { meta -> - patient = meta.patient - sample = meta.sample - gender = meta.gender - status = meta.status + patient = meta.patient[0] + sample = meta.sample[0] + gender = meta.gender[0] + status = meta.status[0] bam = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam" bai = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam.bai" table = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.recal.table" From 93a15ec09adf9b29e9677ef00d01f278f243c5e7 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 30 Sep 2020 17:59:19 +0200 Subject: [PATCH 138/200] fix: typo --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c48badadf0..0210126eff 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -89,7 +89,7 @@ jobs: run: git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data - name: Run germline test run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input data/testdata/tiny/normal --saved_bam_mapped + nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input data/testdata/tiny/normal --save_bam_mapped nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step prepare_recalibration -resume nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step recalibrate -resume @@ -115,7 +115,7 @@ jobs: # run: git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data # - name: Run germline test # run: | - # nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input data/testdata/tiny/normal --saved_bam_mapped + # nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input data/testdata/tiny/normal --save_bam_mapped # nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step prepare_recalibration -resume # nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step recalibrate -resume # nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step variantCalling From 6d1f534c2069f37fc0cbdc772568453b8f9879e9 Mon 
Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 1 Oct 2020 14:45:06 +0200 Subject: [PATCH 139/200] add: PREPARE_RECALIBRATION as a module --- main.nf | 61 ++------------ modules/local/subworkflow/markduplicates.nf | 2 +- .../subworkflow/prepare_recalibration.nf | 81 +++++++++++++++++++ 3 files changed, 88 insertions(+), 56 deletions(-) create mode 100644 modules/local/subworkflow/prepare_recalibration.nf diff --git a/main.nf b/main.nf index a90e983446..960c4aed6f 100644 --- a/main.nf +++ b/main.nf @@ -267,9 +267,10 @@ include { MERGE_BAM as MERGE_BAM_RECAL } from './modules/local/process/merge_ba ================================================================================ */ -include { BUILD_INDICES } from './modules/local/subworkflow/build_indices' -include { MAPPING } from './modules/local/subworkflow/mapping' -include { MARKDUPLICATES } from './modules/local/subworkflow/markduplicates' +include { BUILD_INDICES } from './modules/local/subworkflow/build_indices' +include { MAPPING } from './modules/local/subworkflow/mapping' +include { MARKDUPLICATES } from './modules/local/subworkflow/markduplicates' +include { PREPARE_RECALIBRATION } from './modules/local/subworkflow/prepare_recalibration' /* ================================================================================ @@ -415,60 +416,10 @@ workflow { if (step == 'preparerecalibration') bam_markduplicates = input_sample // STEP 3: CREATING RECALIBRATION TABLES - bam_baserecalibrator = bam_markduplicates.combine(intervals) - BASERECALIBRATOR(bam_baserecalibrator, dbsnp, dbsnp_tbi, dict, fai, fasta, known_indels, known_indels_tbi) - table_bqsr = BASERECALIBRATOR.out.report - tsv_bqsr = BASERECALIBRATOR.out.tsv - // STEP 3.5: MERGING RECALIBRATION TABLES - if (!params.no_intervals) { - BASERECALIBRATOR.out.report.map{ meta, table -> - patient = meta.patient - sample = meta.sample - gender = meta.gender - status = meta.status - [patient, sample, gender, status, table] - }.groupTuple(by: [0,1]).set{ recaltable } - - recaltable = recaltable.map { - patient, sample, gender, status, recal -> - - def meta = [:] - meta.patient = patient - meta.sample = sample - meta.gender = gender[0] - meta.status = status[0] - meta.id = sample - - [meta, recal] - } + PREPARE_RECALIBRATION(bam_markduplicates, intervals, dbsnp, dbsnp_tbi, dict, fai, fasta, known_indels, known_indels_tbi) - GATHERBQSRREPORTS(recaltable) - table_bqsr = GATHERBQSRREPORTS.out.table - tsv_bqsr = GATHERBQSRREPORTS.out.tsv - - } - - // Creating TSV files to restart from this step - tsv_bqsr.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> - patient = meta.patient - sample = meta.sample - gender = meta.gender - status = meta.status - bam = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam" - bai = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam.bai" - ["duplicates_marked_no_table_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n"] - } - - tsv_bqsr.map { meta -> - patient = meta.patient - sample = meta.sample - gender = meta.gender - status = meta.status - bam = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam" - bai = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam.bai" - "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n" - }.collectFile(name: 'duplicates_marked_no_table.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV") + table_bqsr = PREPARE_RECALIBRATION.out.table_bqsr // STEP 4: RECALIBRATING 
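    // The join below pairs each duplicate-marked BAM with its merged BQSR table,
    // matching on the meta map (Nextflow's join keys on the first tuple element
    // by default), so both channels must emit identical meta maps.
    // A sketch with a hypothetical sample 's1':
    //   bam_markduplicates: [[id:'s1', ...], s1.md.bam, s1.md.bam.bai]
    //   table_bqsr:         [[id:'s1', ...], s1.recal.table]
    //   joined:             [[id:'s1', ...], s1.md.bam, s1.md.bam.bai, s1.recal.table]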
bam_applybqsr = bam_markduplicates.join(table_bqsr) diff --git a/modules/local/subworkflow/markduplicates.nf b/modules/local/subworkflow/markduplicates.nf index acd346f10e..df423df402 100644 --- a/modules/local/subworkflow/markduplicates.nf +++ b/modules/local/subworkflow/markduplicates.nf @@ -1,6 +1,6 @@ /* ================================================================================ - MAPPING + MARKDUPLICATES ================================================================================ */ diff --git a/modules/local/subworkflow/prepare_recalibration.nf b/modules/local/subworkflow/prepare_recalibration.nf new file mode 100644 index 0000000000..d2c90b93cf --- /dev/null +++ b/modules/local/subworkflow/prepare_recalibration.nf @@ -0,0 +1,81 @@ +/* +================================================================================ + PREPARE RECALIBRATION +================================================================================ +*/ + +include { GATK_BASERECALIBRATOR as BASERECALIBRATOR } from '../../nf-core/software/gatk/baserecalibrator' +include { GATK_GATHERBQSRREPORTS as GATHERBQSRREPORTS } from '../../nf-core/software/gatk/gatherbqsrreports' + +workflow PREPARE_RECALIBRATION { + take: + bam_markduplicates + intervals + dbsnp + dbsnp_tbi + dict + fai + fasta + known_indels + known_indels_tbi + + main: + + bam_baserecalibrator = bam_markduplicates.combine(intervals) + BASERECALIBRATOR(bam_baserecalibrator, dbsnp, dbsnp_tbi, dict, fai, fasta, known_indels, known_indels_tbi) + table_bqsr = BASERECALIBRATOR.out.report + tsv_bqsr = BASERECALIBRATOR.out.tsv + + // STEP 3.5: MERGING RECALIBRATION TABLES + if (!params.no_intervals) { + BASERECALIBRATOR.out.report.map{ meta, table -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + [patient, sample, gender, status, table] + }.groupTuple(by: [0,1]).set{ recaltable } + + recaltable = recaltable.map { + patient, sample, gender, status, recal -> + + def meta = [:] + meta.patient = patient + meta.sample = sample + meta.gender = gender[0] + meta.status = status[0] + meta.id = sample + + [meta, recal] + } + + GATHERBQSRREPORTS(recaltable) + table_bqsr = GATHERBQSRREPORTS.out.table + tsv_bqsr = GATHERBQSRREPORTS.out.tsv + + } + + // Creating TSV files to restart from this step + tsv_bqsr.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + bam = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam" + bai = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam.bai" + ["duplicates_marked_no_table_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n"] + } + + tsv_bqsr.map { meta -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + bam = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam" + bai = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam.bai" + "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n" + }.collectFile(name: 'duplicates_marked_no_table.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV") + + emit: + table_bqsr = table_bqsr +} From 9c563c099cd663a28dee6e7d22821f06af48d86c Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 5 Oct 2020 14:16:01 +0200 Subject: [PATCH 140/200] fix: test_annotation profile fix from dev --- conf/test_annotation.config | 2 ++ 1 file changed, 2 insertions(+) 
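
The TSV "restart" files written by the PREPARE_RECALIBRATION subworkflow above rely on two flavours of Nextflow's collectFile: when the closure returns a two-element list, the first element names the destination file and the second is the content appended to it; mapping to plain strings and passing name: instead aggregates every record into a single file. A minimal runnable sketch of both modes — the patient/sample values, the restart_*.tsv naming and the results/TSV directory are illustrative, not pipeline outputs:

nextflow.enable.dsl = 2

workflow {
    // hypothetical restart metadata, one map per sample
    samples = Channel.of(
        [patient: 'p1', sample: 's1', gender: 'XX', status: 0],
        [patient: 'p1', sample: 's2', gender: 'XX', status: 1])

    // one file per sample: the closure returns [filename, content]
    samples.collectFile(storeDir: 'results/TSV') { meta ->
        ["restart_${meta.sample}.tsv", "${meta.patient}\t${meta.gender}\t${meta.status}\t${meta.sample}\n"]
    }

    // one aggregated file: map to plain lines first, then name the file
    samples.map { meta ->
        "${meta.patient}\t${meta.gender}\t${meta.status}\t${meta.sample}\n"
    }.collectFile(name: 'restart.tsv', sort: true, storeDir: 'results/TSV')
}
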
diff --git a/conf/test_annotation.config b/conf/test_annotation.config index 6f4e165c35..bbcf38cb88 100644 --- a/conf/test_annotation.config +++ b/conf/test_annotation.config @@ -11,4 +11,6 @@ includeConfig 'test.config' params { input = 'https://raw.githubusercontent.com/nf-core/test-datasets/sarek/testdata/vcf/Strelka_1234N_variants.vcf.gz' + genome = 'WBcel235' + igenomes_ignore = false } \ No newline at end of file From 334ce4aa6013b592302cfe7591a67a04763d861b Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 5 Oct 2020 19:45:10 +0200 Subject: [PATCH 141/200] minor code polish --- lib/Schema.groovy | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/Schema.groovy b/lib/Schema.groovy index 6b7432fa4f..4c7215e699 100644 --- a/lib/Schema.groovy +++ b/lib/Schema.groovy @@ -82,9 +82,8 @@ class JSON { def Map summary = [:] if (workflow.revision) summary['Pipeline Release'] = workflow.revision summary['Run Name'] = run_name ?: workflow.runName - summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job" - if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container" - + summary['Max Resources'] = "${params.max_memory} memory, ${params.max_cpus} cpus, ${params.max_time} time per job" + if (workflow.containerEngine) summary['Container'] = "${workflow.containerEngine} - ${workflow.container}" summary['Input'] = params.input summary['Step'] = step summary['Genome'] = params.genome From 4bbc4013ada96e291c8285d7b7cab1570fe010c5 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 5 Oct 2020 19:45:41 +0200 Subject: [PATCH 142/200] fix: restore --step variantcalling ci test --- .github/workflows/ci.yml | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0210126eff..f414ec3425 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -92,33 +92,7 @@ jobs: nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input data/testdata/tiny/normal --save_bam_mapped nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step prepare_recalibration -resume nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step recalibrate -resume - - # germline: - # env: - # NXF_ANSI_LOG: false - # runs-on: ubuntu-latest - # strategy: - # matrix: - # markduplicates: [--skip_markduplicates, ''] - # steps: - # - uses: actions/checkout@v2 - # - name: Install Nextflow - # run: | - # wget -qO- get.nextflow.io | bash - # sudo mv nextflow /usr/local/bin/ - # env: - # # Only check Nextflow pipeline minimum version - # NXF_VER: '20.07.1' - # - name: Pull docker image - # run: docker pull nfcore/sarek:dsl2 - # - name: Get test data - # run: git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data - # - name: Run germline test - # run: | - # nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input data/testdata/tiny/normal --save_bam_mapped - # nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step prepare_recalibration -resume - # nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step recalibrate -resume - # nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step variantCalling + 
nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step variantCalling # annotation: # env: From b9f4d0834db3d9ac01ccb53f900bc3b51b999f43 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 5 Oct 2020 19:46:12 +0200 Subject: [PATCH 143/200] update modules.config --- conf/modules.config | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 98e5fb54a3..75cfdfcd54 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -32,6 +32,13 @@ params { publish_dir = "" publish_results = "all" } + 'strelka' { + args = "" + extra = "" + suffix = "" + publish_dir = "" + publish_results = "all" + } 'bwamem2_index' { args = "" suffix = "" @@ -46,16 +53,24 @@ params { publish_dir = "" publish_results = "all" } - 'samtools_index_mapped' { + 'merge_bam_mapping' { + suffix = "" + } + 'merge_bam_recalibrate' { + suffix = "md" + } + 'samtools_index_mapping' { args = "" suffix = "" - publish_dir = "Mapped" - publish_results = "all" + publish_dir_up = "Preprocessing" + publish_dir_down = "Mapped" + publish_results = "none" } - 'samtools_index_recal' { + 'samtools_index_recalibrate' { args = "" - suffix = "recal" - publish_dir = "" + suffix = "md" + publish_dir_up = "Preprocessing" + publish_dir_down = "Recalibrated" publish_results = "all" } 'gatk_markduplicates' { From 5605fa8da41f476a48f39e92784f14df940aabe9 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 5 Oct 2020 19:46:33 +0200 Subject: [PATCH 144/200] use modules.config --- modules/local/process/merge_bam.nf | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/modules/local/process/merge_bam.nf b/modules/local/process/merge_bam.nf index 5f54aae475..429f49d2a1 100644 --- a/modules/local/process/merge_bam.nf +++ b/modules/local/process/merge_bam.nf @@ -9,13 +9,15 @@ process MERGE_BAM { input: tuple val(meta), path(bam) + val options output: - tuple val(meta), path("${meta.sample}.bam"), emit: bam - val meta, emit: tsv + tuple val(meta), path("${name}.bam"), emit: bam + val meta, emit: tsv script: + name = options.suffix ? 
"${meta.id}.${options.suffix}" : "${meta.id}" """ - samtools merge --threads ${task.cpus} ${meta.sample}.bam ${bam} + samtools merge --threads ${task.cpus} ${name}.bam ${bam} """ } \ No newline at end of file From aec9bc50ef77a04ee6bf960eaff1a4cef65e0ed4 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 5 Oct 2020 19:47:10 +0200 Subject: [PATCH 145/200] add comments on take inputs --- modules/local/subworkflow/build_indices.nf | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/modules/local/subworkflow/build_indices.nf b/modules/local/subworkflow/build_indices.nf index 17874082e2..05b2620dee 100644 --- a/modules/local/subworkflow/build_indices.nf +++ b/modules/local/subworkflow/build_indices.nf @@ -19,13 +19,13 @@ include { SAMTOOLS_FAIDX } from '../../nf-core/softw workflow BUILD_INDICES{ take: - dbsnp - fasta - germline_resource - known_indels - pon - step - tools + dbsnp // channel: [optional] dbsnp + fasta // channel: [mandatory] fasta + germline_resource // channel: [optional] germline_resource + known_indels // channel: [optional] known_indels + pon // channel: [optional] pon + step // value: [mandatory] starting step + tools // list: [optional] tools to run main: From 8bd3179880cc2eb23f77e3bb2e424d7d070f4c69 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 5 Oct 2020 19:47:43 +0200 Subject: [PATCH 146/200] improve mapping subworkflow --- modules/local/subworkflow/mapping.nf | 191 ++++++++++++++++----------- 1 file changed, 111 insertions(+), 80 deletions(-) diff --git a/modules/local/subworkflow/mapping.nf b/modules/local/subworkflow/mapping.nf index dce167221c..bb45393540 100644 --- a/modules/local/subworkflow/mapping.nf +++ b/modules/local/subworkflow/mapping.nf @@ -4,101 +4,132 @@ ================================================================================ */ -include { BWA_MEM as BWAMEM1_MEM } from '../process/bwa_mem' -include { BWAMEM2_MEM } from '../process/bwamem2_mem' -include { MERGE_BAM as MERGE_BAM_MAPPED } from '../process/merge_bam' -include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_MAPPED } from '../../nf-core/software/samtools/index' +include { BWAMEM2_MEM } from '../process/bwamem2_mem' +include { BWA_MEM as BWAMEM1_MEM } from '../process/bwa_mem' +include { MERGE_BAM } from '../process/merge_bam' +include { QUALIMAP_BAMQC } from '../../nf-core/software/qualimap_bamqc' +include { SAMTOOLS_INDEX } from '../../nf-core/software/samtools/index' +include { SAMTOOLS_STATS } from '../../nf-core/software/samtools/stats' workflow MAPPING { take: - reads_input - bwa - fasta - fai + step // value: [mandatory] starting step + input_sample // channel: [mandatory] input_sample + target_bed // channel: [optional] target_bed + bwa // channel: [mandatory] bwa + fasta // channel: [mandatory] fasta + fai // channel: [mandatory] fai + samtools_opts // map: options for SAMTOOLS_INDEX module + merge_bam_opts // map: options for MERGE_BAM module + skip_bamqc // boolean: true/false + skip_samtools // boolean: true/false main: - bam_bwamem1 = Channel.empty() - bam_bwamem2 = Channel.empty() + bam_mapped_indexed = Channel.empty() + bam_reports = Channel.empty() - if (params.aligner == "bwa-mem") { - BWAMEM1_MEM(reads_input, bwa, fasta, fai, params.modules['bwa_mem']) - bam_bwamem1 = BWAMEM1_MEM.out.bam - } else { - BWAMEM2_MEM(reads_input, bwa, fasta, fai, params.modules['bwamem2_mem']) - bam_bwamem2 = BWAMEM2_MEM.out - } + if (step == "mapping") { + reads_input = input_sample - bam_bwa = bam_bwamem1.mix(bam_bwamem2) - - bam_bwa.map{ meta, bam -> - patient = 
meta.patient - sample = meta.sample - gender = meta.gender - status = meta.status - [patient, sample, gender, status, bam] - }.groupTuple(by: [0,1]) - .branch{ - single: it[4].size() == 1 - multiple: it[4].size() > 1 - }.set{ bam_bwa_to_sort } - - bam_bwa_single = bam_bwa_to_sort.single.map { - patient, sample, gender, status, bam -> - - def meta = [:] - meta.patient = patient - meta.sample = sample - meta.gender = gender[0] - meta.status = status[0] - meta.id = sample - - [meta, bam[0]] - } + bam_bwamem1 = Channel.empty() + bam_bwamem2 = Channel.empty() - bam_bwa_multiple = bam_bwa_to_sort.multiple.map { - patient, sample, gender, status, bam -> + if (params.aligner == "bwa-mem") { + BWAMEM1_MEM(reads_input, bwa, fasta, fai, params.modules['bwa_mem']) + bam_bwamem1 = BWAMEM1_MEM.out.bam + } else { + BWAMEM2_MEM(reads_input, bwa, fasta, fai, params.modules['bwamem2_mem']) + bam_bwamem2 = BWAMEM2_MEM.out + } - def meta = [:] - meta.patient = patient - meta.sample = sample - meta.gender = gender[0] - meta.status = status[0] - meta.id = sample + bam_bwa = bam_bwamem1.mix(bam_bwamem2) + + bam_bwa.map{ meta, bam -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + [patient, sample, gender, status, bam] + }.groupTuple(by: [0,1]) + .branch{ + single: it[4].size() == 1 + multiple: it[4].size() > 1 + }.set{ bam_bwa_to_sort } + + bam_bwa_single = bam_bwa_to_sort.single.map { + patient, sample, gender, status, bam -> + + def meta = [:] + meta.patient = patient + meta.sample = sample + meta.gender = gender[0] + meta.status = status[0] + meta.id = sample + + [meta, bam[0]] + } - [meta, bam] - } + bam_bwa_multiple = bam_bwa_to_sort.multiple.map { + patient, sample, gender, status, bam -> + + def meta = [:] + meta.patient = patient + meta.sample = sample + meta.gender = gender[0] + meta.status = status[0] + meta.id = sample - // STEP 1.5: MERGING AND INDEXING BAM FROM MULTIPLE LANES - - MERGE_BAM_MAPPED(bam_bwa_multiple) - bam_mapped = bam_bwa_single.mix(MERGE_BAM_MAPPED.out.bam) - bam_mapped = SAMTOOLS_INDEX_MAPPED(bam_mapped, params.modules['samtools_index_mapped']) - - if (params.save_bam_mapped) { - tsv_bam_mapped = bam_mapped.map { meta, bam, bai -> [meta] } - // Creating TSV files to restart from this step - tsv_bam_mapped.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> - patient = meta.patient[0] - sample = meta.sample[0] - gender = meta.gender[0] - status = meta.status[0] - bam = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam" - bai = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam.bai" - ["mapped_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n"] + [meta, bam] } - tsv_bam_mapped.map { meta -> - patient = meta.patient[0] - sample = meta.sample[0] - gender = meta.gender[0] - status = meta.status[0] - bam = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam" - bai = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam.bai" - "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n" - }.collectFile(name: "mapped.tsv", sort: true, storeDir: "${params.outdir}/Preprocessing/TSV") + // STEP 1.5: MERGING AND INDEXING BAM FROM MULTIPLE LANES + + MERGE_BAM(bam_bwa_multiple, merge_bam_opts) + bam_mapped = bam_bwa_single.mix(MERGE_BAM.out.bam) + bam_mapped_indexed = SAMTOOLS_INDEX(bam_mapped, samtools_opts) + + qualimap_bamqc = Channel.empty() + samtools_stats = Channel.empty() + + if (!skip_bamqc) { + QUALIMAP_BAMQC(bam_mapped, target_bed) + 
qualimap_bamqc = QUALIMAP_BAMQC.out + } + + if (!skip_samtools) { + SAMTOOLS_STATS(bam_mapped) + samtools_stats = SAMTOOLS_STATS.out + } + + bam_reports = samtools_stats.mix(qualimap_bamqc) + + if (params.save_bam_mapped) { + tsv_bam_mapped = bam_mapped.map { meta, bam, bai -> [meta] } + // Creating TSV files to restart from this step + tsv_bam_mapped.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> + patient = meta.patient[0] + sample = meta.sample[0] + gender = meta.gender[0] + status = meta.status[0] + bam = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam" + bai = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam.bai" + ["mapped_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n"] + } + + tsv_bam_mapped.map { meta -> + patient = meta.patient[0] + sample = meta.sample[0] + gender = meta.gender[0] + status = meta.status[0] + bam = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam" + bai = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam.bai" + "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n" + }.collectFile(name: "mapped.tsv", sort: true, storeDir: "${params.outdir}/Preprocessing/TSV") + } } emit: - bam_mapped = bam_mapped + bam = bam_mapped_indexed + qc = bam_reports } From 1d3ef0f76ca433fd8da7f0cf120d29f1793b754f Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 5 Oct 2020 19:48:10 +0200 Subject: [PATCH 147/200] improve markduplicates subworkflow --- modules/local/subworkflow/markduplicates.nf | 103 ++++++++++---------- 1 file changed, 53 insertions(+), 50 deletions(-) diff --git a/modules/local/subworkflow/markduplicates.nf b/modules/local/subworkflow/markduplicates.nf index df423df402..eff27f2123 100644 --- a/modules/local/subworkflow/markduplicates.nf +++ b/modules/local/subworkflow/markduplicates.nf @@ -8,66 +8,69 @@ include { GATK_MARKDUPLICATES } from '../../nf-core/software/gatk/markduplicates workflow MARKDUPLICATES { take: - bam_mapped + step // value: [mandatory] starting step + bam_mapped // channel: [mandatory] bam_mapped main: bam_markduplicates = bam_mapped report_markduplicates = Channel.empty() - if (!params.skip_markduplicates) { - GATK_MARKDUPLICATES(bam_mapped) - report_markduplicates = GATK_MARKDUPLICATES.out.report - bam_markduplicates = GATK_MARKDUPLICATES.out.bam - tsv_markduplicates = GATK_MARKDUPLICATES.out.tsv + if (step == "mapping") { + if (!params.skip_markduplicates) { + GATK_MARKDUPLICATES(bam_mapped) + report_markduplicates = GATK_MARKDUPLICATES.out.report + bam_markduplicates = GATK_MARKDUPLICATES.out.bam + tsv_markduplicates = GATK_MARKDUPLICATES.out.tsv - // Creating TSV files to restart from this step - tsv_markduplicates.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> - patient = meta.patient - sample = meta.sample - gender = meta.gender - status = meta.status - bam = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam" - bai = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam.bai" - table = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.recal.table" - ["duplicates_marked_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n"] - } + // Creating TSV files to restart from this step + tsv_markduplicates.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + bam = 
"${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam" + bai = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam.bai" + table = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.recal.table" + ["duplicates_marked_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n"] + } - tsv_markduplicates.map { meta -> - patient = meta.patient - sample = meta.sample - gender = meta.gender - status = meta.status - bam = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam" - bai = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam.bai" - table = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.recal.table" - "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n" - }.collectFile(name: 'duplicates_marked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV") - } else { - tsv_no_markduplicates = bam_markduplicates.map { meta, bam, bai -> [meta] } + tsv_markduplicates.map { meta -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + bam = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam" + bai = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam.bai" + table = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.recal.table" + "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n" + }.collectFile(name: 'duplicates_marked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV") + } else { + tsv_no_markduplicates = bam_markduplicates.map { meta, bam, bai -> [meta] } - // Creating TSV files to restart from this step - tsv_no_markduplicates.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> - patient = meta.patient[0] - sample = meta.sample[0] - gender = meta.gender[0] - status = meta.status[0] - bam = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam" - bai = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam.bai" - table = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.recal.table" - ["mapped_no_duplicates_marked_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n"] - } + // Creating TSV files to restart from this step + tsv_no_markduplicates.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> + patient = meta.patient[0] + sample = meta.sample[0] + gender = meta.gender[0] + status = meta.status[0] + bam = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam" + bai = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam.bai" + table = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.recal.table" + ["mapped_no_duplicates_marked_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n"] + } - tsv_no_markduplicates.map { meta -> - patient = meta.patient[0] - sample = meta.sample[0] - gender = meta.gender[0] - status = meta.status[0] - bam = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam" - bai = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam.bai" - table = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.recal.table" - "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n" - }.collectFile(name: 'mapped_no_duplicates_marked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV") + tsv_no_markduplicates.map { meta -> + patient = 
meta.patient[0] + sample = meta.sample[0] + gender = meta.gender[0] + status = meta.status[0] + bam = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam" + bai = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam.bai" + table = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.recal.table" + "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n" + }.collectFile(name: 'mapped_no_duplicates_marked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV") + } } emit: From cb1041c7be1d3bacb8f9091a078dde7c1a38ddde Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 5 Oct 2020 19:48:38 +0200 Subject: [PATCH 148/200] improve prepare_recalibration subworkflow --- .../subworkflow/prepare_recalibration.nf | 115 ++++++++++-------- 1 file changed, 61 insertions(+), 54 deletions(-) diff --git a/modules/local/subworkflow/prepare_recalibration.nf b/modules/local/subworkflow/prepare_recalibration.nf index d2c90b93cf..a028f3f8f5 100644 --- a/modules/local/subworkflow/prepare_recalibration.nf +++ b/modules/local/subworkflow/prepare_recalibration.nf @@ -9,72 +9,79 @@ include { GATK_GATHERBQSRREPORTS as GATHERBQSRREPORTS } from '../../nf-core/soft workflow PREPARE_RECALIBRATION { take: - bam_markduplicates - intervals - dbsnp - dbsnp_tbi - dict - fai - fasta - known_indels - known_indels_tbi + step // value: [mandatory] starting step + bam_markduplicates // channel: [mandatory] bam_markduplicates + intervals // channel: [mandatory] intervals + dbsnp // channel: [optional] dbsnp + dbsnp_tbi // channel: [optional] dbsnp_tbi + dict // channel: [mandatory] dict + fai // channel: [mandatory] fai + fasta // channel: [mandatory] fasta + known_indels // channel: [optional] known_indels + known_indels_tbi // channel: [optional] known_indels_tbi main: bam_baserecalibrator = bam_markduplicates.combine(intervals) - BASERECALIBRATOR(bam_baserecalibrator, dbsnp, dbsnp_tbi, dict, fai, fasta, known_indels, known_indels_tbi) - table_bqsr = BASERECALIBRATOR.out.report - tsv_bqsr = BASERECALIBRATOR.out.tsv + table_bqsr = Channel.empty() + tsv_bqsr = Channel.empty() - // STEP 3.5: MERGING RECALIBRATION TABLES - if (!params.no_intervals) { - BASERECALIBRATOR.out.report.map{ meta, table -> - patient = meta.patient - sample = meta.sample - gender = meta.gender - status = meta.status - [patient, sample, gender, status, table] - }.groupTuple(by: [0,1]).set{ recaltable } + if (step in ["mapping", "preparerecalibration"]) { - recaltable = recaltable.map { - patient, sample, gender, status, recal -> + BASERECALIBRATOR(bam_baserecalibrator, dbsnp, dbsnp_tbi, dict, fai, fasta, known_indels, known_indels_tbi) + table_bqsr = BASERECALIBRATOR.out.report + tsv_bqsr = BASERECALIBRATOR.out.tsv - def meta = [:] - meta.patient = patient - meta.sample = sample - meta.gender = gender[0] - meta.status = status[0] - meta.id = sample + // STEP 3.5: MERGING RECALIBRATION TABLES + if (!params.no_intervals) { + BASERECALIBRATOR.out.report.map{ meta, table -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + [patient, sample, gender, status, table] + }.groupTuple(by: [0,1]).set{ recaltable } - [meta, recal] - } + recaltable = recaltable.map { + patient, sample, gender, status, recal -> - GATHERBQSRREPORTS(recaltable) - table_bqsr = GATHERBQSRREPORTS.out.table - tsv_bqsr = GATHERBQSRREPORTS.out.tsv + def meta = [:] + meta.patient = patient + meta.sample = sample + meta.gender = gender[0] + meta.status = status[0] + meta.id = sample - } + [meta, recal] 
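+                // the closure's last expression is what the channel emits, e.g.
+                // [[patient:'p1', sample:'s1', ...], [s1_1.recal.table, s1_2.recal.table]]
+                // (recal is a list here: groupTuple gathered one table per interval,
+                // and gender/status became per-group lists, hence the [0] above)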
+ } - // Creating TSV files to restart from this step - tsv_bqsr.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> - patient = meta.patient - sample = meta.sample - gender = meta.gender - status = meta.status - bam = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam" - bai = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam.bai" - ["duplicates_marked_no_table_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n"] - } + GATHERBQSRREPORTS(recaltable) + table_bqsr = GATHERBQSRREPORTS.out.table + tsv_bqsr = GATHERBQSRREPORTS.out.tsv + + } + + // Creating TSV files to restart from this step + tsv_bqsr.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + bam = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam" + bai = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam.bai" + ["duplicates_marked_no_table_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n"] + } - tsv_bqsr.map { meta -> - patient = meta.patient - sample = meta.sample - gender = meta.gender - status = meta.status - bam = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam" - bai = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam.bai" - "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n" - }.collectFile(name: 'duplicates_marked_no_table.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV") + tsv_bqsr.map { meta -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + bam = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam" + bai = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam.bai" + "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n" + }.collectFile(name: 'duplicates_marked_no_table.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV") + } emit: table_bqsr = table_bqsr From 298a4994e483c338acda3341caf3e78a4fa70f05 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 5 Oct 2020 19:49:11 +0200 Subject: [PATCH 149/200] use container and conda directive --- modules/nf-core/software/qualimap_bamqc.nf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/modules/nf-core/software/qualimap_bamqc.nf b/modules/nf-core/software/qualimap_bamqc.nf index 233add760f..76df692d2d 100644 --- a/modules/nf-core/software/qualimap_bamqc.nf +++ b/modules/nf-core/software/qualimap_bamqc.nf @@ -6,6 +6,10 @@ process QUALIMAP_BAMQC { publishDir "${params.outdir}/Reports/${meta.id}/bamQC", mode: params.publish_dir_mode + container "quay.io/biocontainers/qualimap:2.2.2d--1" + + conda (params.conda ? 
"bioconda::qualimap=2.2.2d" : null) + input: tuple val(meta), path(bam) path(target_bed) From ecc09129d052d4c436a3fffff08bc69d87a0e7fd Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 5 Oct 2020 19:49:43 +0200 Subject: [PATCH 150/200] improve reusability of module --- modules/nf-core/software/samtools/index.nf | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/modules/nf-core/software/samtools/index.nf b/modules/nf-core/software/samtools/index.nf index ed832c0205..9dcbe5c524 100644 --- a/modules/nf-core/software/samtools/index.nf +++ b/modules/nf-core/software/samtools/index.nf @@ -4,10 +4,10 @@ process SAMTOOLS_INDEX { tag "${meta.id}" publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: { - if (params.save_bam_mapped) "Preprocessing/${meta.sample}/Mapped/${it}" - else null - } + saveAs: { filename -> + if (options.publish_results == "none") null + else if (filename.endsWith('.version.txt')) null + else "${options.publish_dir_up}/${meta.sample}/${options.publish_dir_down}/${filename}" } container "quay.io/biocontainers/samtools:1.10--h2e538c0_3" @@ -18,13 +18,13 @@ process SAMTOOLS_INDEX { val options output: - tuple val(meta), path("${prefix}.bam"), path("*.bai") + tuple val(meta), path("${name}.bam"), path("*.bai") script: - prefix = options.suffix ? "${meta.id}.${options.suffix}" : "${meta.id}" + name = options.suffix ? "${meta.id}.${options.suffix}" : "${meta.id}" """ - [ ! -f ${prefix}.bam ] && ln -s ${bam} ${prefix}.bam + [ ! -f ${name}.bam ] && ln -s ${bam} ${name}.bam - samtools index ${prefix}.bam + samtools index ${name}.bam """ } \ No newline at end of file From 224c911bf8197bc5aabf7d8734e12d9918c87906 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 5 Oct 2020 19:50:07 +0200 Subject: [PATCH 151/200] add: RECALIBRATE as a module --- modules/local/subworkflow/recalibrate.nf | 121 +++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 modules/local/subworkflow/recalibrate.nf diff --git a/modules/local/subworkflow/recalibrate.nf b/modules/local/subworkflow/recalibrate.nf new file mode 100644 index 0000000000..fefd54c20c --- /dev/null +++ b/modules/local/subworkflow/recalibrate.nf @@ -0,0 +1,121 @@ +/* +================================================================================ + PREPARE RECALIBRATION +================================================================================ +*/ + +include { GATK_APPLYBQSR as APPLYBQSR } from '../../nf-core/software/gatk/applybqsr' +include { MERGE_BAM } from '../process/merge_bam' +include { SAMTOOLS_STATS } from '../../nf-core/software/samtools/stats' +include { SAMTOOLS_INDEX } from '../../nf-core/software/samtools/index' +include { QUALIMAP_BAMQC } from '../../nf-core/software/qualimap_bamqc' + +workflow RECALIBRATE { + take: + step // value: [mandatory] starting step + bam_applybqsr // channel: [mandatory] bam_applybqsr + intervals // channel: [mandatory] intervals + target_bed // channel: [optional] target_bed + dict // channel: [mandatory] dict + fasta // channel: [mandatory] fasta + fai // channel: [mandatory] fai + samtools_opts // map: options for SAMTOOLS_INDEX module + merge_bam_opts // map: options for MERGE_BAM module + skip_bamqc // boolean: true/false + skip_samtools // boolean: true/false + + main: + + bam_recalibrated_indexed = Channel.empty() + bam_recalibrated = Channel.empty() + bam_reports = Channel.empty() + + if (step in ["mapping", "preparerecalibration", "recalibrate"]) { + + bam_applybqsr = bam_applybqsr.combine(intervals) + + 
APPLYBQSR(bam_applybqsr, dict, fasta, fai)
+
+        // STEP 4.5: MERGING AND INDEXING THE RECALIBRATED BAM FILES
+        if (params.no_intervals) {
+            bam_recalibrated = APPLYBQSR.out.bam
+            tsv_recalibrated = APPLYBQSR.out.tsv
+        } else {
+            APPLYBQSR.out.bam.map{ meta, bam -> //, bai ->
+                patient = meta.patient
+                sample = meta.sample
+                gender = meta.gender
+                status = meta.status
+                [patient, sample, gender, status, bam] //, bai]
+            }.groupTuple(by: [0,1]).set{ bam_recal_to_merge }
+
+            bam_recal_to_merge = bam_recal_to_merge.map {
+                patient, sample, gender, status, bam -> //, bai ->
+
+                def meta = [:]
+                meta.patient = patient
+                meta.sample = sample
+                meta.gender = gender[0]
+                meta.status = status[0]
+                meta.id = sample
+
+                [meta, bam]
+            }
+
+            MERGE_BAM(bam_recal_to_merge, merge_bam_opts)
+            bam_recalibrated = MERGE_BAM.out.bam
+            tsv_recalibrated = MERGE_BAM.out.tsv
+        }
+
+        bam_recalibrated_indexed = SAMTOOLS_INDEX(bam_recalibrated, samtools_opts)
+
+        qualimap_bamqc = Channel.empty()
+        samtools_stats = Channel.empty()
+
+        if (!skip_bamqc) {
+            QUALIMAP_BAMQC(bam_recalibrated, target_bed)
+            qualimap_bamqc = QUALIMAP_BAMQC.out
+        }
+
+        if (!skip_samtools) {
+            SAMTOOLS_STATS(bam_recalibrated)
+            samtools_stats = SAMTOOLS_STATS.out
+        }
+
+        bam_reports = samtools_stats.mix(qualimap_bamqc)
+
+        //TODO: set bam_recalibrated with all these steps
+        // // When using sentieon for mapping, Channel bam_recalibrated is bam_sentieon_recal
+        // if (params.sentieon && step == 'mapping') bam_recalibrated = bam_sentieon_recal
+
+        // // When no knownIndels for mapping, Channel bam_recalibrated is bam_duplicates_marked
+        // if (!params.known_indels && step == 'mapping') bam_recalibrated = bam_duplicates_marked
+
+        // // When starting with variant calling, Channel bam_recalibrated is input_sample
+        // if (step == 'variantcalling') bam_recalibrated = input_sample
+        // Creating TSV files to restart from this step
+        tsv_recalibrated.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta ->
+            patient = meta.patient
+            sample = meta.sample
+            gender = meta.gender
+            status = meta.status
+            bam = "${params.outdir}/Preprocessing/${sample}/Recalibrated/${sample}.md.bam"
+            bai = "${params.outdir}/Preprocessing/${sample}/Recalibrated/${sample}.md.bam.bai"
+            ["recalibrated_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n"]
+        }
+
+        tsv_recalibrated.map { meta ->
+            patient = meta.patient
+            sample = meta.sample
+            gender = meta.gender
+            status = meta.status
+            bam = "${params.outdir}/Preprocessing/${sample}/Recalibrated/${sample}.md.bam"
+            bai = "${params.outdir}/Preprocessing/${sample}/Recalibrated/${sample}.md.bam.bai"
+            "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n"
+        }.collectFile(name: 'recalibrated.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV")
+    }
+
+    emit:
+        bam = bam_recalibrated
+        qc = bam_reports
+}

From 55eaf7623effa296a1f0930116f350dfdcfd0fe3 Mon Sep 17 00:00:00 2001
From: MaxUlysse
Date: Mon, 5 Oct 2020 19:50:41 +0200
Subject: [PATCH 152/200] add: RECALIBRATE as a module + code improvement

---
 main.nf | 155 ++++++++++++++++++++++++--------------------------
 1 file changed, 65 insertions(+), 90 deletions(-)

diff --git a/main.nf b/main.nf
index 960c4aed6f..568092bd1d 100644
--- a/main.nf
+++ b/main.nf
@@ -174,6 +174,8 @@ if (tsv_path) {
 ================================================================================
 */

+modules = params.modules
+
 // Initialize each params in params.genomes, catch the command line first if it was defined
 params.ac_loci = 
params.genome ? params.genomes[params.genome].ac_loci ?: false : false params.ac_loci_gc = params.genome ? params.genomes[params.genome].ac_loci_gc ?: false : false @@ -271,6 +273,7 @@ include { BUILD_INDICES } from './modules/local/subworkflow/build_indice include { MAPPING } from './modules/local/subworkflow/mapping' include { MARKDUPLICATES } from './modules/local/subworkflow/markduplicates' include { PREPARE_RECALIBRATION } from './modules/local/subworkflow/prepare_recalibration' +include { RECALIBRATE } from './modules/local/subworkflow/recalibrate' /* ================================================================================ @@ -278,12 +281,6 @@ include { PREPARE_RECALIBRATION } from './modules/local/subworkflow/prepare_reca ================================================================================ */ -include { GATK_BASERECALIBRATOR as BASERECALIBRATOR } from './modules/nf-core/software/gatk/baserecalibrator' -include { GATK_GATHERBQSRREPORTS as GATHERBQSRREPORTS } from './modules/nf-core/software/gatk/gatherbqsrreports' -include { GATK_APPLYBQSR as APPLYBQSR } from './modules/nf-core/software/gatk/applybqsr' -include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_RECAL } from './modules/nf-core/software/samtools/index' -include { SAMTOOLS_STATS as SAMTOOLS_STATS } from './modules/nf-core/software/samtools/stats' -include { QUALIMAP_BAMQC as BAMQC } from './modules/nf-core/software/qualimap_bamqc' include { GATK_HAPLOTYPECALLER as HAPLOTYPECALLER } from './modules/nf-core/software/gatk/haplotypecaller' include { GATK_GENOTYPEVCF as GENOTYPEVCF } from './modules/nf-core/software/gatk/genotypegvcf' include { STRELKA as STRELKA } from './modules/nf-core/software/strelka' @@ -372,21 +369,24 @@ workflow { ================================================================================ */ - qc_reports = Channel.empty() - bam_mapped = Channel.empty() - - if (step == 'mapping') input_reads = input_sample - else input_reads = Channel.empty() + bam_mapped = Channel.empty() + bam_mapped_qc = Channel.empty() + bam_recalibrated_qc = Channel.empty() + input_reads = Channel.empty() + qc_reports = Channel.empty() - // STEP 0.5: QC & TRIM IF SPECIFIED ON READS + // STEP 0: QC & TRIM + // --skip_qc fastqc to skip fastqc + // trim only run when --trim_fastq is specified + // and have the corresponding options set up QC_TRIM( - input_reads, - ('fastqc' in skip_qc), + input_sample, + ('fastqc' in skip_qc || step != "mapping"), !(params.trim_fastq), - params.modules['fastqc'], - params.modules['trimgalore'] - ) + modules['fastqc'], + modules['trimgalore']) + reads_input = QC_TRIM.out.reads qc_reports = qc_reports.mix( @@ -396,20 +396,32 @@ workflow { QC_TRIM.out.trimgalore_log, QC_TRIM.out.trimgalore_zip) - // STEP 1: MAPPING READS TO REFERENCE GENOME WITH BWA MEM + // STEP 1: MAPPING READS TO REFERENCE GENOME WITH BWA-MEM + + if (params.save_bam_mapped) modules['samtools_index_mapping']['publish_results'] = "all" MAPPING( + step, reads_input, + target_bed, bwa, fasta, - fai - ) + fai, + modules['samtools_index_mapping'], + modules['merge_bam_mapping'], + ('bamqc' in skip_qc), + ('samtools' in skip_qc)) + + bam_mapped = MAPPING.out.bam + bam_mapped_qc = MAPPING.out.qc - bam_mapped = MAPPING.out.bam_mapped + qc_reports = qc_reports.mix(bam_mapped_qc) // STEP 2: MARKING DUPLICATES - MARKDUPLICATES(bam_mapped) + MARKDUPLICATES( + step, + bam_mapped) bam_markduplicates = MARKDUPLICATES.out.bam @@ -417,7 +429,17 @@ workflow { // STEP 3: CREATING RECALIBRATION TABLES - PREPARE_RECALIBRATION(bam_markduplicates, 
intervals, dbsnp, dbsnp_tbi, dict, fai, fasta, known_indels, known_indels_tbi) + PREPARE_RECALIBRATION( + step, + bam_markduplicates, + intervals, + dbsnp, + dbsnp_tbi, + dict, + fai, + fasta, + known_indels, + known_indels_tbi) table_bqsr = PREPARE_RECALIBRATION.out.table_bqsr @@ -426,83 +448,36 @@ workflow { if (step == 'recalibrate') bam_applybqsr = input_sample - bam_applybqsr = bam_applybqsr.combine(intervals) - - APPLYBQSR(bam_applybqsr, dict, fasta, fai) - - bam_recalibrated = APPLYBQSR.out.bam - tsv_recalibrated = APPLYBQSR.out.tsv - - // STEP 4.5: MERGING AND INDEXING THE RECALIBRATED BAM FILES - if (!params.no_intervals) { - APPLYBQSR.out.bam.map{ meta, bam -> //, bai -> - patient = meta.patient - sample = meta.sample - gender = meta.gender - status = meta.status - [patient, sample, gender, status, bam] //, bai] - }.groupTuple(by: [0,1]).set{ bam_recal_to_merge } - - bam_recal_to_merge = bam_recal_to_merge.map { - patient, sample, gender, status, bam -> //, bai -> - - def meta = [:] - meta.patient = patient - meta.sample = sample - meta.gender = gender[0] - meta.status = status[0] - meta.id = sample + RECALIBRATE( + step, + bam_applybqsr, + intervals, + target_bed, + dict, + fasta, + fai, + modules['samtools_index_recalibrate'], + modules['merge_bam_recalibrate'], + ('bamqc' in skip_qc), + ('samtools' in skip_qc)) - [meta, bam] - } + bam_recalibrated = RECALIBRATE.out.bam + bam_recalibrated_qc = RECALIBRATE.out.qc - MERGE_BAM_RECAL(bam_recal_to_merge) - bam_recalibrated = MERGE_BAM_RECAL.out.bam - tsv_recalibrated = MERGE_BAM_RECAL.out.tsv - } - //TODO: set bam_recalibrated with all these steps - // // When using sentieon for mapping, Channel bam_recalibrated is bam_sentieon_recal - // if (params.sentieon && step == 'mapping') bam_recalibrated = bam_sentieon_recal - - // // When no knownIndels for mapping, Channel bam_recalibrated is bam_duplicates_marked - // if (!params.known_indels && step == 'mapping') bam_recalibrated = bam_duplicates_marked - - // // When starting with variant calling, Channel bam_recalibrated is input_sample - // if (step == 'variantcalling') bam_recalibrated = input_sample - // Creating TSV files to restart from this step - tsv_recalibrated.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> - patient = meta.patient - sample = meta.sample - gender = meta.gender - status = meta.status - bam = "${params.outdir}/Preprocessing/${sample}/Recalibrated/${sample}.md.bam" - bai = "${params.outdir}/Preprocessing/${sample}/Recalibrated/${sample}.md.bam.bai" - ["recalibrated_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n"] - } + qc_reports = qc_reports.mix(bam_recalibrated_qc) - tsv_recalibrated.map { meta -> - patient = meta.patient - sample = meta.sample - gender = meta.gender - status = meta.status - bam = "${params.outdir}/Preprocessing/${sample}/Recalibrated/${sample}.md.bam" - bai = "${params.outdir}/Preprocessing/${sample}/Recalibrated/${sample}.md.bam.bai" - "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n" - }.collectFile(name: 'recalibrated.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV") + if (step == 'variantcalling') bam_variant_calling = input_sample - // STEP 5: QC - if (!'samtools' in skip_qc) SAMTOOLS_STATS(bam_bwa.mix(recal)) - if (!'bamqc' in skip_qc) BAMQC(bam_bwa.mix(recal), target_bed) + bam_variant_calling = bam_recalibrated /* ================================================================================ GERMLINE VARIANT CALLING 
================================================================================ */ - //TODO double check whether the indexing has to be repeated here. there is a bai file somewhere up at ApplyBQSR - bam_recalibrated_indexed_variant_calling = SAMTOOLS_INDEX_RECAL(bam_recalibrated, params.modules['samtools_index_mapped'],) + if ('haplotypecaller' in tools){ - bam_haplotypecaller = bam_recalibrated_indexed_variant_calling.combine(intervals) + bam_haplotypecaller = bam_variant_calling.combine(intervals) // STEP GATK HAPLOTYPECALLER.1 @@ -531,7 +506,7 @@ workflow { } if ('strelka' in tools) { - STRELKA(bam_recalibrated_indexed_variant_calling, fasta, fai, target_bed, params.modules['strelka']) + STRELKA(bam_variant_calling, fasta, fai, target_bed, modules['strelka']) } /* From 271b8dd17c0ad8b8314404a084dc911c3ab54f64 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 5 Oct 2020 20:30:17 +0200 Subject: [PATCH 153/200] fix: correct usage of --save_bam_mapped --- main.nf | 5 +++-- modules/local/subworkflow/mapping.nf | 25 +++++++++++++------------ 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/main.nf b/main.nf index 568092bd1d..5712c66778 100644 --- a/main.nf +++ b/main.nf @@ -398,7 +398,7 @@ workflow { // STEP 1: MAPPING READS TO REFERENCE GENOME WITH BWA-MEM - if (params.save_bam_mapped) modules['samtools_index_mapping']['publish_results'] = "all" + if (save_bam_mapped) modules['samtools_index_mapping'].publish_results = "all" MAPPING( step, @@ -410,7 +410,8 @@ workflow { modules['samtools_index_mapping'], modules['merge_bam_mapping'], ('bamqc' in skip_qc), - ('samtools' in skip_qc)) + ('samtools' in skip_qc), + save_bam_mapped) bam_mapped = MAPPING.out.bam bam_mapped_qc = MAPPING.out.qc diff --git a/modules/local/subworkflow/mapping.nf b/modules/local/subworkflow/mapping.nf index bb45393540..78ae8bdacc 100644 --- a/modules/local/subworkflow/mapping.nf +++ b/modules/local/subworkflow/mapping.nf @@ -13,16 +13,17 @@ include { SAMTOOLS_STATS } from '../../nf-core/software/samtools/stats' workflow MAPPING { take: - step // value: [mandatory] starting step - input_sample // channel: [mandatory] input_sample - target_bed // channel: [optional] target_bed - bwa // channel: [mandatory] bwa - fasta // channel: [mandatory] fasta - fai // channel: [mandatory] fai - samtools_opts // map: options for SAMTOOLS_INDEX module - merge_bam_opts // map: options for MERGE_BAM module - skip_bamqc // boolean: true/false - skip_samtools // boolean: true/false + step // value: [mandatory] starting step + input_sample // channel: [mandatory] input_sample + target_bed // channel: [optional] target_bed + bwa // channel: [mandatory] bwa + fasta // channel: [mandatory] fasta + fai // channel: [mandatory] fai + samtools_opts // map: options for SAMTOOLS_INDEX module + merge_bam_opts // map: options for MERGE_BAM module + skip_bamqc // boolean: true/false + skip_samtools // boolean: true/false + save_bam_mapped // boolean: true/false main: @@ -104,8 +105,8 @@ workflow MAPPING { bam_reports = samtools_stats.mix(qualimap_bamqc) - if (params.save_bam_mapped) { - tsv_bam_mapped = bam_mapped.map { meta, bam, bai -> [meta] } + if (save_bam_mapped) { + tsv_bam_mapped = bam_mapped.map { meta, bam -> [meta] } // Creating TSV files to restart from this step tsv_bam_mapped.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> patient = meta.patient[0] From 7ba719b7c37eae44df23b3d81b2ce280f1fc053e Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Mon, 5 Oct 2020 20:40:28 +0200 Subject: [PATCH 154/200] ci: 
split germline test runs --- .github/workflows/ci.yml | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f414ec3425..be6e098f8a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -87,12 +87,14 @@ jobs: run: docker pull nfcore/sarek:dsl2 - name: Get test data run: git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data - - name: Run germline test - run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input data/testdata/tiny/normal --save_bam_mapped - nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step prepare_recalibration -resume - nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step recalibrate -resume - nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step variantCalling + - name: Run germline test --step mapping + run: nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input data/testdata/tiny/normal --save_bam_mapped + - name: Run germline test --step prepare_recalibration + run: nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step prepare_recalibration -resume + - name: Run germline test --step recalibrate + run: nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step recalibrate -resume + - name: Run germline test --step variantCalling + run: nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step variantCalling # annotation: # env: From c735634a65326ef873aae327ad7f89f35c2fdaef Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 6 Oct 2020 14:31:26 +0200 Subject: [PATCH 155/200] code polishing --- .github/workflows/ci.yml | 1 - main.nf | 23 ++++++++++------------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index be6e098f8a..37efb5f6e8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -180,7 +180,6 @@ jobs: # intervals: --no_intervals # - tool: TIDDIT # intervals: --no_intervals - # steps: # - uses: actions/checkout@v2 # - name: Install Nextflow diff --git a/main.nf b/main.nf index 5712c66778..a79d4e4cfe 100644 --- a/main.nf +++ b/main.nf @@ -269,11 +269,11 @@ include { MERGE_BAM as MERGE_BAM_RECAL } from './modules/local/process/merge_ba ================================================================================ */ -include { BUILD_INDICES } from './modules/local/subworkflow/build_indices' -include { MAPPING } from './modules/local/subworkflow/mapping' -include { MARKDUPLICATES } from './modules/local/subworkflow/markduplicates' -include { PREPARE_RECALIBRATION } from './modules/local/subworkflow/prepare_recalibration' -include { RECALIBRATE } from './modules/local/subworkflow/recalibrate' +include { BUILD_INDICES } from './modules/local/subworkflow/build_indices' +include { MAPPING } from './modules/local/subworkflow/mapping' +include { MARKDUPLICATES } from './modules/local/subworkflow/markduplicates' +include { PREPARE_RECALIBRATION } from './modules/local/subworkflow/prepare_recalibration' +include { RECALIBRATE } from './modules/local/subworkflow/recalibrate' /* ================================================================================ @@ -281,10 +281,7 @@ include 
{ RECALIBRATE } from './modules/local/subworkflow/recalibrate' ================================================================================ */ -include { GATK_HAPLOTYPECALLER as HAPLOTYPECALLER } from './modules/nf-core/software/gatk/haplotypecaller' -include { GATK_GENOTYPEVCF as GENOTYPEVCF } from './modules/nf-core/software/gatk/genotypegvcf' -include { STRELKA as STRELKA } from './modules/nf-core/software/strelka' -include { MULTIQC } from './modules/nf-core/software/multiqc' +include { MULTIQC } from './modules/nf-core/software/multiqc' /* ================================================================================ @@ -292,7 +289,7 @@ include { MULTIQC } from './modules/nf-c ================================================================================ */ -include { QC_TRIM } from './modules/nf-core/subworkflow/qc_trim' +include { QC_TRIM } from './modules/nf-core/subworkflow/qc_trim' // PREPARING CHANNELS FOR PREPROCESSING AND QC @@ -376,9 +373,9 @@ workflow { qc_reports = Channel.empty() // STEP 0: QC & TRIM - // --skip_qc fastqc to skip fastqc - // trim only run when --trim_fastq is specified - // and have the corresponding options set up + // `--skip_qc fastqc` to skip fastqc + // trim only with `--trim_fastq` + // addtional options to be set up QC_TRIM( input_sample, From 84e891ffc295a2ba7b663e6f9fa8f4dd2f231212 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 6 Oct 2020 14:32:01 +0200 Subject: [PATCH 156/200] add: mulled container for bwa-mem2 2.0 + samtools 1.10 --- environment.yml | 7 ------- modules/local/process/bwamem2_mem.nf | 2 +- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/environment.yml b/environment.yml index 89afd25977..b05d4d3789 100644 --- a/environment.yml +++ b/environment.yml @@ -9,10 +9,3 @@ dependencies: - conda-forge::markdown=3.1.1 - conda-forge::pymdown-extensions=6.0 - conda-forge::pygments=2.5.2 - - bioconda::bwa-mem2=2.0 - - bioconda::ensembl-vep=99.2 - - bioconda::genesplicer=1.0 - - bioconda::samtools=1.10 - - bioconda::snpeff=4.3.1t - - conda-forge::pigz=2.3.4 - - conda-forge::r-ggplot2=3.3.0 diff --git a/modules/local/process/bwamem2_mem.nf b/modules/local/process/bwamem2_mem.nf index bcca32a2c3..db67534832 100644 --- a/modules/local/process/bwamem2_mem.nf +++ b/modules/local/process/bwamem2_mem.nf @@ -9,7 +9,7 @@ process BWAMEM2_MEM { if (filename.endsWith('.version.txt')) null else filename } - container "nfcore/sarek:dsl2" + container "quay.io/biocontainers/mulled-v2-e5d375990341c5aef3c9aff74f96f66f65375ef6:876eb6f1d38fbf578296ea94e5aede4e317939e7-0" conda (params.conda ? 
"bioconda::bwa-mem2=2.0 bioconda::samtools=1.10" : null) From d6513c0fce1132315d2dcacafc0e20142623b170 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 6 Oct 2020 15:21:24 +0200 Subject: [PATCH 157/200] code polishing --- main.nf | 36 ++++++++++--------- modules/local/process/output_documentation.nf | 22 ------------ modules/local/subworkflow/build_indices.nf | 13 +++---- modules/local/subworkflow/mapping.nf | 2 +- modules/local/subworkflow/markduplicates.nf | 2 +- .../subworkflow/prepare_recalibration.nf | 2 +- modules/local/subworkflow/recalibrate.nf | 2 +- 7 files changed, 30 insertions(+), 49 deletions(-) delete mode 100644 modules/local/process/output_documentation.nf diff --git a/main.nf b/main.nf index a79d4e4cfe..93b3cf841f 100644 --- a/main.nf +++ b/main.nf @@ -255,17 +255,15 @@ if (params.sentieon) log.warn "[nf-core/sarek] Sentieon will be used, only works /* ================================================================================ - INCLUDE LOCAL PIPELINE MODULES + INCLUDE LOCAL MODULES ================================================================================ */ include { GET_SOFTWARE_VERSIONS } from './modules/local/process/get_software_versions' -include { OUTPUT_DOCUMENTATION } from './modules/local/process/output_documentation' -include { MERGE_BAM as MERGE_BAM_RECAL } from './modules/local/process/merge_bam' /* ================================================================================ - INCLUDE LOCAL PIPELINE SUBWORKFLOWS + INCLUDE LOCAL SUBWORKFLOWS ================================================================================ */ @@ -277,7 +275,7 @@ include { RECALIBRATE } from './modules/local/subworkflow/reca /* ================================================================================ - INCLUDE nf-core PIPELINE MODULES + INCLUDE nf-core MODULES ================================================================================ */ @@ -285,7 +283,7 @@ include { MULTIQC } from './modules/nf-core/software/multi /* ================================================================================ - INCLUDE nf-core PIPELINE SUBWORKFLOWS + INCLUDE nf-core SUBWORKFLOWS ================================================================================ */ @@ -340,6 +338,12 @@ include { QC_TRIM } from './modules/nf-core/subworkflow/qc workflow { +/* +================================================================================ + BUILD INDICES +================================================================================ +*/ + BUILD_INDICES( dbsnp, fasta, @@ -347,7 +351,8 @@ workflow { known_indels, pon, step, - tools) + tools, + modules['bwa_index']) intervals = BUILD_INDICES.out.intervals @@ -360,11 +365,11 @@ workflow { known_indels_tbi = params.known_indels ? params.known_indels_index ? file(params.known_indels_index) : BUILD_INDICES.out.known_indels_tbi.collect() : file("${params.outdir}/no_file") pon_tbi = params.pon ? params.pon_index ? 
file(params.pon_index) : BUILD_INDICES.out.pon_tbi : file("${params.outdir}/no_file") - /* - ================================================================================ - PREPROCESSING - ================================================================================ - */ +/* +================================================================================ + PREPROCESSING +================================================================================ +*/ bam_mapped = Channel.empty() bam_mapped_qc = Channel.empty() @@ -464,10 +469,10 @@ workflow { qc_reports = qc_reports.mix(bam_recalibrated_qc) - if (step == 'variantcalling') bam_variant_calling = input_sample - bam_variant_calling = bam_recalibrated + if (step == 'variantcalling') bam_variant_calling = input_sample + /* ================================================================================ GERMLINE VARIANT CALLING @@ -525,9 +530,6 @@ workflow { MultiQC ================================================================================ */ - OUTPUT_DOCUMENTATION( - output_docs, - output_docs_images) GET_SOFTWARE_VERSIONS() diff --git a/modules/local/process/output_documentation.nf b/modules/local/process/output_documentation.nf deleted file mode 100644 index c9bc3f13eb..0000000000 --- a/modules/local/process/output_documentation.nf +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Output Markdown documentation to HTML - */ -process OUTPUT_DOCUMENTATION { - publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode - - container "nfcore/sarek:dsl2" - - conda (params.conda ? "$baseDir/environment.yml" : null) - - input: - path output_docs - path images - - output: - path "results_description.html" - - script: - """ - markdown_to_html.py ${output_docs} -o results_description.html - """ -} diff --git a/modules/local/subworkflow/build_indices.nf b/modules/local/subworkflow/build_indices.nf index 05b2620dee..e5919aca9f 100644 --- a/modules/local/subworkflow/build_indices.nf +++ b/modules/local/subworkflow/build_indices.nf @@ -1,10 +1,10 @@ /* ================================================================================ - BUILDING INDEXES + BUILDING INDICES ================================================================================ */ -// And then initialize channels based on params or indexes that were just built +// And then initialize channels based on params or indices that were just built include { BUILD_INTERVALS } from '../process/build_intervals.nf' include { BWA_INDEX } from '../../nf-core/software/bwa/index/main.nf' @@ -26,21 +26,22 @@ workflow BUILD_INDICES{ pon // channel: [optional] pon step // value: [mandatory] starting step tools // list: [optional] tools to run + bwa_index_opts // map: options for BWA_INDEX module main: result_bwa = Channel.empty() version_bwa = Channel.empty() - if (!(params.bwa) && params.fasta && 'mapping' in step) - if (params.aligner == "bwa-mem") (result_bwa, version_bwa) = BWA_INDEX(fasta, params.modules['bwa_index']) + if (!(params.bwa) && 'mapping' in step) + if (params.aligner == "bwa-mem") (result_bwa, version_bwa) = BWA_INDEX(fasta, bwa_index_opts) else result_bwa = BWAMEM2_INDEX(fasta) result_dict = Channel.empty() - if (!(params.dict) && params.fasta && !('annotate' in step) && !('controlfreec' in step)) + if (!(params.dict) && !('annotate' in step) && !('controlfreec' in step)) result_dict = GATK_DICT(fasta) result_fai = Channel.empty() - if (!(params.fasta_fai) && params.fasta && !('annotate' in step)) + if (!(params.fasta_fai) && !('annotate' in step)) result_fai = 
SAMTOOLS_FAIDX(fasta) result_dbsnp_tbi = Channel.empty() diff --git a/modules/local/subworkflow/mapping.nf b/modules/local/subworkflow/mapping.nf index 78ae8bdacc..6f3d1d5428 100644 --- a/modules/local/subworkflow/mapping.nf +++ b/modules/local/subworkflow/mapping.nf @@ -1,6 +1,6 @@ /* ================================================================================ - MAPPING + MAPPING ================================================================================ */ diff --git a/modules/local/subworkflow/markduplicates.nf b/modules/local/subworkflow/markduplicates.nf index eff27f2123..6ac36299ad 100644 --- a/modules/local/subworkflow/markduplicates.nf +++ b/modules/local/subworkflow/markduplicates.nf @@ -1,6 +1,6 @@ /* ================================================================================ - MARKDUPLICATES + MARKDUPLICATES ================================================================================ */ diff --git a/modules/local/subworkflow/prepare_recalibration.nf b/modules/local/subworkflow/prepare_recalibration.nf index a028f3f8f5..a6dec6dd2c 100644 --- a/modules/local/subworkflow/prepare_recalibration.nf +++ b/modules/local/subworkflow/prepare_recalibration.nf @@ -1,6 +1,6 @@ /* ================================================================================ - PREPARE RECALIBRATION + PREPARE RECALIBRATION ================================================================================ */ diff --git a/modules/local/subworkflow/recalibrate.nf b/modules/local/subworkflow/recalibrate.nf index fefd54c20c..ab2e28667e 100644 --- a/modules/local/subworkflow/recalibrate.nf +++ b/modules/local/subworkflow/recalibrate.nf @@ -1,6 +1,6 @@ /* ================================================================================ - PREPARE RECALIBRATION + RECALIBRATE ================================================================================ */ From e8ac7dc60a911d3711dfbf10ed7c0bd002fe01aa Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 6 Oct 2020 15:46:35 +0200 Subject: [PATCH 158/200] add: GERMLINE VARIANT CALLING as a module --- main.nf | 38 +++--------- .../subworkflow/germline_variant_calling.nf | 61 +++++++++++++++++++ modules/nf-core/software/gatk/genotypegvcf.nf | 2 +- 3 files changed, 72 insertions(+), 29 deletions(-) create mode 100644 modules/local/subworkflow/germline_variant_calling.nf diff --git a/main.nf b/main.nf index 93b3cf841f..4d83e145e2 100644 --- a/main.nf +++ b/main.nf @@ -272,6 +272,7 @@ include { MAPPING } from './modules/local/subworkflow/mapp include { MARKDUPLICATES } from './modules/local/subworkflow/markduplicates' include { PREPARE_RECALIBRATION } from './modules/local/subworkflow/prepare_recalibration' include { RECALIBRATE } from './modules/local/subworkflow/recalibrate' +include { GERMLINE_VARIANT_CALLING } from './modules/local/subworkflow/germline_variant_calling' /* ================================================================================ @@ -479,34 +480,15 @@ workflow { ================================================================================ */ - if ('haplotypecaller' in tools){ - bam_haplotypecaller = bam_variant_calling.combine(intervals) - - // STEP GATK HAPLOTYPECALLER.1 - - HAPLOTYPECALLER(bam_haplotypecaller, dbsnp, - dbsnp_tbi, - dict, - fasta, - fai) - - - // STEP GATK HAPLOTYPECALLER.2 - GENOTYPEVCF(HAPLOTYPECALLER.out.gvcfGenotypeGVCFs, dbsnp, - dbsnp_tbi, - dict, - fasta, - fai) - - GENOTYPEVCF.out.map{name, meta, vcf -> - patient = meta.patient - sample = meta.sample - gender = meta.gender - status = meta.status - [name, 
patient, sample, gender, status, vcf] - }.groupTuple(by: [0,1,2,]) - .set{ vcfGenotypeGVCFs } - } + GERMLINE_VARIANT_CALLING( + bam_variant_calling, + intervals, + tools, + target_bed, + dbsnp, + dbsnp_tbi, + fasta, + fai) if ('strelka' in tools) { STRELKA(bam_variant_calling, fasta, fai, target_bed, modules['strelka']) diff --git a/modules/local/subworkflow/germline_variant_calling.nf b/modules/local/subworkflow/germline_variant_calling.nf new file mode 100644 index 0000000000..2af4190afb --- /dev/null +++ b/modules/local/subworkflow/germline_variant_calling.nf @@ -0,0 +1,61 @@ +/* +================================================================================ + GERMLINE VARIANT CALLING +================================================================================ +*/ + +include { GATK_HAPLOTYPECALLER as HAPLOTYPECALLER } from '../../nf-core/software/gatk/haplotypecaller' +include { GATK_GENOTYPEGVCF as GENOTYPEGVCF } from '../../nf-core/software/gatk/genotypegvcf' + + +workflow GERMLINE_VARIANT_CALLING { + take: + bam_variant_calling // channel: [mandatory] bam + intervals // channel: [mandatory] intervals + tools // list: [mandatory] list of tools + target_bed // channel: [optional] target_bed + dbsnp // channel: [mandatory] dbsnp + dbsnp_tbi // channel: [mandatory] dbsnp_tbi + fasta // channel: [mandatory] fasta + fai // channel: [mandatory] fai + + main: + + vcfGenotypeGVCFs = Channel.empty() + + if ('haplotypecaller' in tools) { + bam_haplotypecaller = bam_variant_calling.combine(intervals) + + // STEP GATK HAPLOTYPECALLER.1 + + HAPLOTYPECALLER( + bam_haplotypecaller, + dbsnp, + dbsnp_tbi, + dict, + fasta, + fai) + + + // STEP GATK HAPLOTYPECALLER.2 + GENOTYPEGVCF( + HAPLOTYPECALLER.out.gvcfGenotypeGVCFs, + dbsnp, + dbsnp_tbi, + dict, + fasta, + fai) + + GENOTYPEGVCF.out.map{ name, meta, vcf -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + [name, patient, sample, gender, status, vcf] + }.groupTuple(by: [0,1,2,]) + .set{ vcfGenotypeGVCFs } + } + + emit: + vcfGenotypeGVCFs = vcfGenotypeGVCFs +} diff --git a/modules/nf-core/software/gatk/genotypegvcf.nf b/modules/nf-core/software/gatk/genotypegvcf.nf index 4c8f50cd16..07bff711f7 100644 --- a/modules/nf-core/software/gatk/genotypegvcf.nf +++ b/modules/nf-core/software/gatk/genotypegvcf.nf @@ -1,4 +1,4 @@ -process GATK_GENOTYPEVCF { +process GATK_GENOTYPEGVCF { tag "${meta.id}-${interval.baseName}" container "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" From 79ece09ff681ae871df236d1cc799c951a82a3bc Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 6 Oct 2020 17:19:37 +0200 Subject: [PATCH 159/200] add: HaplotypeCaller within GERMLINE_VARIANT_CALLING module --- .github/workflows/ci.yml | 61 ++++++++++--------- conf/base.config | 2 +- conf/modules.config | 18 ++++-- main.nf | 10 +-- .../subworkflow/germline_variant_calling.nf | 54 ++++++++++++++-- modules/nf-core/software/gatk/genotypegvcf.nf | 2 +- .../nf-core/software/gatk/haplotypecaller.nf | 4 +- 7 files changed, 102 insertions(+), 49 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 37efb5f6e8..3add518d8c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -163,33 +163,34 @@ jobs: # - name: Run ${{ matrix.profile }} test # run: nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.profile }},docker - # tools: - # env: - # NXF_ANSI_LOG: false - # runs-on: ubuntu-latest - # strategy: - # matrix: - # tool: [Haplotypecaller, Freebayes, Manta, mpileup, MSIsensor, Strelka, TIDDIT] - 
# intervals: [--no_intervals, ''] - # exclude: - # - tool: Manta - # intervals: --no_intervals - # - tool: MSIsensor - # intervals: --no_intervals - # - tool: Strelka - # intervals: --no_intervals - # - tool: TIDDIT - # intervals: --no_intervals - # steps: - # - uses: actions/checkout@v2 - # - name: Install Nextflow - # run: | - # wget -qO- get.nextflow.io | bash - # sudo mv nextflow /usr/local/bin/ - # env: - # # Only check Nextflow pipeline minimum version - # NXF_VER: '20.07.1' - # - name: Pull docker image - # run: docker pull nfcore/sarek:dsl2 - # - name: Run ${{ matrix.tool }} test - # run: nextflow run ${GITHUB_WORKSPACE} -profile test_tool,docker --tools ${{ matrix.tool }} ${{ matrix.intervals }} + tools: + env: + NXF_ANSI_LOG: false + runs-on: ubuntu-latest + strategy: + matrix: + # tool: [Haplotypecaller, Freebayes, Manta, mpileup, MSIsensor, Strelka, TIDDIT] + tool: [Haplotypecaller] + intervals: [--no_intervals, ''] + # exclude: + # - tool: Manta + # intervals: --no_intervals + # - tool: MSIsensor + # intervals: --no_intervals + # - tool: Strelka + # intervals: --no_intervals + # - tool: TIDDIT + # intervals: --no_intervals + steps: + - uses: actions/checkout@v2 + - name: Install Nextflow + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + env: + # Only check Nextflow pipeline minimum version + NXF_VER: '20.07.1' + - name: Pull docker image + run: docker pull nfcore/sarek:dsl2 + - name: Run ${{ matrix.tool }} test + run: nextflow run ${GITHUB_WORKSPACE} -profile test_tool,docker --tools ${{ matrix.tool }} ${{ matrix.intervals }} diff --git a/conf/base.config b/conf/base.config index 79e339bd98..94ca2c2a12 100644 --- a/conf/base.config +++ b/conf/base.config @@ -52,7 +52,7 @@ process { cache = false } - withName:CONCATVCF { + withName:CONCAT_VCF { // For unknown reasons, CONCATVCF sometimes fails with SIGPIPE // (exit code 141). Rerunning the process will usually work. errorStrategy = {task.exitStatus == 141 ? 
'retry' : 'terminate'} diff --git a/conf/modules.config b/conf/modules.config index 75cfdfcd54..61db7c7c42 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -73,6 +73,18 @@ params { publish_dir_down = "Recalibrated" publish_results = "all" } + 'concat_vcf_haplotypecaller' { + args = "" + suffix = "" + publish_dir = "haplotypecaller" + publish_results = "all" + } + 'concat_vcf_haplotypecallergvcf' { + args = "" + suffix = ".g" + publish_dir = "haplotypecallergvcf" + publish_results = "all" + } 'gatk_markduplicates' { args = "ASSUME_SORTED=true REMOVE_DUPLICATES=false VALIDATION_STRINGENCY=LENIENT TMP_DIR=tmp" suffix = ".mLb.mkD" @@ -85,11 +97,5 @@ params { publish_dir = "pipeline_info" publish_results = "all" } - 'output_documentation' { - args = "" - suffix = "" - publish_dir = "pipeline_info" - publish_results = "all" - } } } diff --git a/main.nf b/main.nf index 4d83e145e2..e6f8f859dc 100644 --- a/main.nf +++ b/main.nf @@ -485,14 +485,16 @@ workflow { intervals, tools, target_bed, + dict, dbsnp, dbsnp_tbi, fasta, - fai) + fai, + modules) - if ('strelka' in tools) { - STRELKA(bam_variant_calling, fasta, fai, target_bed, modules['strelka']) - } + // if ('strelka' in tools) { + // STRELKA(bam_variant_calling, fasta, fai, target_bed, modules['strelka']) + // } /* ================================================================================ diff --git a/modules/local/subworkflow/germline_variant_calling.nf b/modules/local/subworkflow/germline_variant_calling.nf index 2af4190afb..7c7dbf2977 100644 --- a/modules/local/subworkflow/germline_variant_calling.nf +++ b/modules/local/subworkflow/germline_variant_calling.nf @@ -6,6 +6,8 @@ include { GATK_HAPLOTYPECALLER as HAPLOTYPECALLER } from '../../nf-core/software/gatk/haplotypecaller' include { GATK_GENOTYPEGVCF as GENOTYPEGVCF } from '../../nf-core/software/gatk/genotypegvcf' +include { CONCAT_VCF as CONCAT_GVCF; + CONCAT_VCF as CONCAT_HAPLOTYPECALLER} from '../process/concat_vcf' workflow GERMLINE_VARIANT_CALLING { @@ -14,10 +16,12 @@ workflow GERMLINE_VARIANT_CALLING { intervals // channel: [mandatory] intervals tools // list: [mandatory] list of tools target_bed // channel: [optional] target_bed + dict // channel: [mandatory] dict dbsnp // channel: [mandatory] dbsnp dbsnp_tbi // channel: [mandatory] dbsnp_tbi fasta // channel: [mandatory] fasta fai // channel: [mandatory] fai + modules // map: [mandatory] maps for modules main: @@ -36,24 +40,64 @@ workflow GERMLINE_VARIANT_CALLING { fasta, fai) + haplotypecallergvcf = HAPLOTYPECALLER.out.gvcf.map{ meta, vcf -> + patient = meta.patient + sample = meta.sample + gender = meta.gender + status = meta.status + [ patient, sample, gender, status, vcf] + }.groupTuple(by: [0,1]) + + haplotypecallergvcf = haplotypecallergvcf.map { patient, sample, gender, status, vcf -> + def meta = [:] + meta.patient = patient + meta.sample = sample + meta.gender = gender[0] + meta.status = status[0] + meta.id = meta.sample + [ meta, vcf ] + } + + CONCAT_GVCF( + haplotypecallergvcf, + fai, + target_bed, + modules['concat_vcf_haplotypecallergvcf']) // STEP GATK HAPLOTYPECALLER.2 + GENOTYPEGVCF( - HAPLOTYPECALLER.out.gvcfGenotypeGVCFs, + HAPLOTYPECALLER.out.interval_gvcf, dbsnp, dbsnp_tbi, dict, fasta, fai) - GENOTYPEGVCF.out.map{ name, meta, vcf -> + haplotypecallervcf = GENOTYPEGVCF.out.map{ meta, vcf -> patient = meta.patient sample = meta.sample gender = meta.gender status = meta.status - [name, patient, sample, gender, status, vcf] - }.groupTuple(by: [0,1,2,]) - .set{ vcfGenotypeGVCFs } + [ patient, 
sample, gender, status, vcf] + }.groupTuple(by: [0,1]) + + haplotypecallervcf = haplotypecallervcf.map { patient, sample, gender, status, vcf -> + def meta = [:] + meta.patient = patient + meta.sample = sample + meta.gender = gender[0] + meta.status = status[0] + meta.id = meta.sample + [ meta, vcf ] + } + + CONCAT_HAPLOTYPECALLER( + haplotypecallervcf, + fai, + target_bed, + modules['concat_vcf_haplotypecaller']) + } emit: diff --git a/modules/nf-core/software/gatk/genotypegvcf.nf b/modules/nf-core/software/gatk/genotypegvcf.nf index 07bff711f7..9c7533746c 100644 --- a/modules/nf-core/software/gatk/genotypegvcf.nf +++ b/modules/nf-core/software/gatk/genotypegvcf.nf @@ -14,7 +14,7 @@ process GATK_GENOTYPEGVCF { path fai output: - tuple val("HaplotypeCaller"), val(meta), path("${interval.baseName}_${meta.id}.vcf") + tuple val(meta), path("${interval.baseName}_${meta.id}.vcf") script: // Using -L is important for speed and we have to index the interval files also diff --git a/modules/nf-core/software/gatk/haplotypecaller.nf b/modules/nf-core/software/gatk/haplotypecaller.nf index 44d0adc16c..7ee415a432 100644 --- a/modules/nf-core/software/gatk/haplotypecaller.nf +++ b/modules/nf-core/software/gatk/haplotypecaller.nf @@ -17,8 +17,8 @@ process GATK_HAPLOTYPECALLER { path fai output: - tuple val("HaplotypeCallerGVCF"), val(meta), path("${interval.baseName}_${meta.id}.g.vcf"), emit: gvcfHaplotypeCaller - tuple val(meta), path(interval), path("${interval.baseName}_${meta.id}.g.vcf"), emit: gvcfGenotypeGVCFs + tuple val(meta), path("${interval.baseName}_${meta.id}.g.vcf"), emit: gvcf + tuple val(meta), path(interval), path("${interval.baseName}_${meta.id}.g.vcf"), emit: interval_gvcf script: From b235bec87fe08dd86196c05d35ba59d04b596723 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 6 Oct 2020 17:21:22 +0200 Subject: [PATCH 160/200] fix: forgot to add file... --- modules/local/process/concat_vcf.nf | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 modules/local/process/concat_vcf.nf diff --git a/modules/local/process/concat_vcf.nf b/modules/local/process/concat_vcf.nf new file mode 100644 index 0000000000..d0b4e4b8cd --- /dev/null +++ b/modules/local/process/concat_vcf.nf @@ -0,0 +1,28 @@ +process CONCAT_VCF { + label 'cpus_8' + + tag "${options.publish_dir}-${meta.id}" + + publishDir "${params.outdir}/VariantCalling/${meta.id}/${options.publish_dir}", mode: params.publish_dir_mode + + container "quay.io/biocontainers/htslib:1.11--hd3b49d5_0" + + conda (params.conda ? "bioconda::htslib=1.11" : null) + + input: + tuple val(meta), path(vcf) + path fai + path bed + val options + + output: + tuple val(meta), path("*_*.vcf.gz"), path("*_*.vcf.gz.tbi"), emit: vcf + + script: + name = options.suffix ? "${options.publish_dir}_${meta.id}${options.suffix}" : "${options.publish_dir}_${meta.id}" + target_options = params.target_bed ? "-t ${bed}" : "" + interval_options = params.no_intervals ? 
"-n" : "" + """ + concatenateVCFs.sh -i ${fai} -c ${task.cpus} -o ${name}.vcf ${target_options} ${interval_options} + """ +} \ No newline at end of file From 3a31ee26ce937d0afa5c3a4c7d1a070ddd35b107 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 7 Oct 2020 10:03:33 +0200 Subject: [PATCH 161/200] move file and rename module to STRELKA_GERMLINE --- .../{strelka.nf => strelka/germline.nf} | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) rename modules/nf-core/software/{strelka.nf => strelka/germline.nf} (83%) diff --git a/modules/nf-core/software/strelka.nf b/modules/nf-core/software/strelka/germline.nf similarity index 83% rename from modules/nf-core/software/strelka.nf rename to modules/nf-core/software/strelka/germline.nf index e7e15db507..8640cf57f7 100644 --- a/modules/nf-core/software/strelka.nf +++ b/modules/nf-core/software/strelka/germline.nf @@ -1,11 +1,11 @@ // Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' +include { initOptions; saveFiles; getSoftwareName } from './../functions' -process STRELKA { +process STRELKA_GERMLINE { tag "$meta.id" - label 'cpus_max' - label 'memory_max' + label 'CPUS_MAX' + label 'MEMORY_MAX' publishDir "${params.outdir}", mode: params.publish_dir_mode, @@ -17,13 +17,14 @@ process STRELKA { input: tuple val(meta), path(bam), path (bai) - path(fasta) - path(fai) - path(target_bed) + path fasta + path fai + path target_bed val options output: - tuple val("Strelka"), val(meta), path("*.vcf.gz"), path("*.vcf.gz.tbi"), emit: vcfStrelkaSingle + tuple val(meta), path("*_variants.vcf.gz"), path("*_variants.vcf.gz.tbi"), emit: vcf + tuple val(meta), path("*_genome.vcf.gz"), path("*_genome.vcf.gz.tbi"), emit: genome_vcf path "*.version.txt", emit: version script: From 17d33fb7095e8ecc82bad0987be21824cf2ec3ef Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 7 Oct 2020 10:03:52 +0200 Subject: [PATCH 162/200] remove comments --- main.nf | 4 ---- 1 file changed, 4 deletions(-) diff --git a/main.nf b/main.nf index e6f8f859dc..0baa584c97 100644 --- a/main.nf +++ b/main.nf @@ -492,10 +492,6 @@ workflow { fai, modules) - // if ('strelka' in tools) { - // STRELKA(bam_variant_calling, fasta, fai, target_bed, modules['strelka']) - // } - /* ================================================================================ SOMATIC VARIANT CALLING From a3671511fc331c3b60c6552fca9102c6ee95d743 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 7 Oct 2020 10:04:21 +0200 Subject: [PATCH 163/200] update modules.config for strelka_germline --- conf/modules.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 61db7c7c42..11f83d2a27 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -32,11 +32,11 @@ params { publish_dir = "" publish_results = "all" } - 'strelka' { + 'strelka_germline' { args = "" extra = "" suffix = "" - publish_dir = "" + publish_dir = "strelka" publish_results = "all" } 'bwamem2_index' { From 3dd66cc313592838ac7d23a0b3ec1aee7d577a3a Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 7 Oct 2020 10:04:36 +0200 Subject: [PATCH 164/200] add Strelka_germline --- .github/workflows/ci.yml | 8 +-- .../subworkflow/germline_variant_calling.nf | 64 ++++++++++++------- 2 files changed, 45 insertions(+), 27 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3add518d8c..54a8b97a8a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -170,15 +170,15 @@ jobs: 
strategy: matrix: # tool: [Haplotypecaller, Freebayes, Manta, mpileup, MSIsensor, Strelka, TIDDIT] - tool: [Haplotypecaller] + tool: [Haplotypecaller, Strelka] intervals: [--no_intervals, ''] - # exclude: + exclude: # - tool: Manta # intervals: --no_intervals # - tool: MSIsensor # intervals: --no_intervals - # - tool: Strelka - # intervals: --no_intervals + - tool: Strelka + intervals: --no_intervals # - tool: TIDDIT # intervals: --no_intervals steps: diff --git a/modules/local/subworkflow/germline_variant_calling.nf b/modules/local/subworkflow/germline_variant_calling.nf index 7c7dbf2977..8819ea0b95 100644 --- a/modules/local/subworkflow/germline_variant_calling.nf +++ b/modules/local/subworkflow/germline_variant_calling.nf @@ -8,39 +8,41 @@ include { GATK_HAPLOTYPECALLER as HAPLOTYPECALLER } from '../../nf-core/software include { GATK_GENOTYPEGVCF as GENOTYPEGVCF } from '../../nf-core/software/gatk/genotypegvcf' include { CONCAT_VCF as CONCAT_GVCF; CONCAT_VCF as CONCAT_HAPLOTYPECALLER} from '../process/concat_vcf' - +include { STRELKA_GERMLINE } from '../../nf-core/software/strelka/germline' workflow GERMLINE_VARIANT_CALLING { take: - bam_variant_calling // channel: [mandatory] bam - intervals // channel: [mandatory] intervals - tools // list: [mandatory] list of tools - target_bed // channel: [optional] target_bed - dict // channel: [mandatory] dict - dbsnp // channel: [mandatory] dbsnp - dbsnp_tbi // channel: [mandatory] dbsnp_tbi - fasta // channel: [mandatory] fasta - fai // channel: [mandatory] fai - modules // map: [mandatory] maps for modules + bam // channel: [mandatory] bam + intervals // channel: [mandatory] intervals + tools // list: [mandatory] list of tools + target_bed // channel: [optional] target_bed + dict // channel: [mandatory] dict + dbsnp // channel: [mandatory] dbsnp + dbsnp_tbi // channel: [mandatory] dbsnp_tbi + fasta // channel: [mandatory] fasta + fai // channel: [mandatory] fai + modules // map: [mandatory] maps for modules main: - vcfGenotypeGVCFs = Channel.empty() + haplotypecaller_gvcf = Channel.empty() + haplotypecaller_vcf = Channel.empty() + strelka_vcf = Channel.empty() if ('haplotypecaller' in tools) { - bam_haplotypecaller = bam_variant_calling.combine(intervals) + haplotypecaller_interval_bam = bam.combine(intervals) // STEP GATK HAPLOTYPECALLER.1 HAPLOTYPECALLER( - bam_haplotypecaller, + haplotypecaller_interval_bam, dbsnp, dbsnp_tbi, dict, fasta, fai) - haplotypecallergvcf = HAPLOTYPECALLER.out.gvcf.map{ meta, vcf -> + haplotypecaller_interval_gvcf = HAPLOTYPECALLER.out.gvcf.map{ meta, vcf -> patient = meta.patient sample = meta.sample gender = meta.gender @@ -48,7 +50,7 @@ workflow GERMLINE_VARIANT_CALLING { [ patient, sample, gender, status, vcf] }.groupTuple(by: [0,1]) - haplotypecallergvcf = haplotypecallergvcf.map { patient, sample, gender, status, vcf -> + haplotypecaller_interval_gvcf = haplotypecaller_interval_gvcf.map { patient, sample, gender, status, vcf -> def meta = [:] meta.patient = patient meta.sample = sample @@ -59,11 +61,13 @@ workflow GERMLINE_VARIANT_CALLING { } CONCAT_GVCF( - haplotypecallergvcf, + haplotypecaller_interval_gvcf, fai, target_bed, - modules['concat_vcf_haplotypecallergvcf']) - + modules['concat_vcf_haplotypecaller_gvcf']) + + haplotypecaller_gvcf = CONCAT_GVCF.out.vcf + // STEP GATK HAPLOTYPECALLER.2 GENOTYPEGVCF( @@ -74,7 +78,7 @@ workflow GERMLINE_VARIANT_CALLING { fasta, fai) - haplotypecallervcf = GENOTYPEGVCF.out.map{ meta, vcf -> + haplotypecaller_interval_vcf = GENOTYPEGVCF.out.map{ meta, vcf -> patient = 
meta.patient sample = meta.sample gender = meta.gender @@ -82,7 +86,7 @@ workflow GERMLINE_VARIANT_CALLING { [ patient, sample, gender, status, vcf] }.groupTuple(by: [0,1]) - haplotypecallervcf = haplotypecallervcf.map { patient, sample, gender, status, vcf -> + haplotypecaller_interval_vcf = haplotypecaller_interval_vcf.map { patient, sample, gender, status, vcf -> def meta = [:] meta.patient = patient meta.sample = sample @@ -93,13 +97,27 @@ workflow GERMLINE_VARIANT_CALLING { } CONCAT_HAPLOTYPECALLER( - haplotypecallervcf, + haplotypecaller_interval_vcf, fai, target_bed, modules['concat_vcf_haplotypecaller']) + haplotypecaller_vcf = CONCAT_HAPLOTYPECALLER.out.vcf + } + + if ('strelka' in tools) { + STRELKA_GERMLINE( + bam, + fasta, + fai, + target_bed, + modules['strelka_germline']) + + strelka_vcf = STRELKA_GERMLINE.out.vcf } emit: - vcfGenotypeGVCFs = vcfGenotypeGVCFs + haplotypecaller_gvcf = haplotypecaller_gvcf + haplotypecaller_vcf = haplotypecaller_vcf + strelka_vcf = strelka_vcf } From c67821c06749ccf6b93e7973002b5b57ca2a3ce1 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 7 Oct 2020 10:14:29 +0200 Subject: [PATCH 165/200] fix: typo --- modules/local/subworkflow/germline_variant_calling.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/local/subworkflow/germline_variant_calling.nf b/modules/local/subworkflow/germline_variant_calling.nf index 8819ea0b95..0878ce642b 100644 --- a/modules/local/subworkflow/germline_variant_calling.nf +++ b/modules/local/subworkflow/germline_variant_calling.nf @@ -8,7 +8,7 @@ include { GATK_HAPLOTYPECALLER as HAPLOTYPECALLER } from '../../nf-core/software include { GATK_GENOTYPEGVCF as GENOTYPEGVCF } from '../../nf-core/software/gatk/genotypegvcf' include { CONCAT_VCF as CONCAT_GVCF; CONCAT_VCF as CONCAT_HAPLOTYPECALLER} from '../process/concat_vcf' -include { STRELKA_GERMLINE } from '../../nf-core/software/strelka/germline' +include { STRELKA_GERMLINE as STRELKA } from '../../nf-core/software/strelka/germline' workflow GERMLINE_VARIANT_CALLING { take: @@ -64,7 +64,7 @@ workflow GERMLINE_VARIANT_CALLING { haplotypecaller_interval_gvcf, fai, target_bed, - modules['concat_vcf_haplotypecaller_gvcf']) + modules['concat_vcf_haplotypecallergvcf']) haplotypecaller_gvcf = CONCAT_GVCF.out.vcf @@ -106,14 +106,14 @@ workflow GERMLINE_VARIANT_CALLING { } if ('strelka' in tools) { - STRELKA_GERMLINE( + STRELKA( bam, fasta, fai, target_bed, modules['strelka_germline']) - strelka_vcf = STRELKA_GERMLINE.out.vcf + strelka_vcf = STRELKA.out.vcf } From 2a3105cf005fb3c112a4ee1875bec66224df5d3c Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 7 Oct 2020 11:49:06 +0200 Subject: [PATCH 166/200] cleanup --- .vscode/settings.json | 1 - 1 file changed, 1 deletion(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 2cd8968be8..0000000000 --- a/.vscode/settings.json +++ /dev/null @@ -1 +0,0 @@ -123,125 \ No newline at end of file From af4ca34fa377b2e88c2371d525ad2a28c5a6c052 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 7 Oct 2020 11:49:15 +0200 Subject: [PATCH 167/200] update containers --- containers/snpeff/Dockerfile | 2 +- containers/snpeff/environment.yml | 2 +- containers/vep/Dockerfile | 2 +- containers/vep/environment.yml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/containers/snpeff/Dockerfile b/containers/snpeff/Dockerfile index 9904cd90cd..8687437cff 100644 --- a/containers/snpeff/Dockerfile +++
b/containers/snpeff/Dockerfile @@ -1,4 +1,4 @@ -FROM nfcore/base:1.9 +FROM nfcore/base:1.10.2 LABEL \ author="Maxime Garcia" \ description="snpEff image for use in nf-core/sarek" \ diff --git a/containers/snpeff/environment.yml b/containers/snpeff/environment.yml index 1441f11664..f04b058328 100644 --- a/containers/snpeff/environment.yml +++ b/containers/snpeff/environment.yml @@ -1,5 +1,5 @@ # You can use this file to create a conda environment for this pipeline: -# conda env create -f environment.yml +# conda env create -f environment.yml name: nf-core-sarek-snpeff-3.0dev channels: - conda-forge diff --git a/containers/vep/Dockerfile b/containers/vep/Dockerfile index 06fc619aca..730d54db42 100644 --- a/containers/vep/Dockerfile +++ b/containers/vep/Dockerfile @@ -1,4 +1,4 @@ -FROM nfcore/base:1.9 +FROM nfcore/base:1.10.2 LABEL \ author="Maxime Garcia" \ description="VEP image for use in nf-core/sarek" \ diff --git a/containers/vep/environment.yml b/containers/vep/environment.yml index d657224355..a0095be4c5 100644 --- a/containers/vep/environment.yml +++ b/containers/vep/environment.yml @@ -1,5 +1,5 @@ # You can use this file to create a conda environment for this pipeline: -# conda env create -f environment.yml +# conda env create -f environment.yml name: nf-core-sarek-vep-3.0dev channels: - conda-forge From e78130dfb98bce2a95987d0655cbeb2743eddf51 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 7 Oct 2020 11:49:58 +0200 Subject: [PATCH 168/200] sort parameters alphabetically --- main.nf | 61 +++++++++---------- modules/local/subworkflow/build_indices.nf | 4 +- .../subworkflow/germline_variant_calling.nf | 10 +-- modules/local/subworkflow/mapping.nf | 33 +++++----- modules/local/subworkflow/markduplicates.nf | 2 +- .../subworkflow/prepare_recalibration.nf | 4 +- modules/local/subworkflow/recalibrate.nf | 45 +++++++------- modules/nf-core/subworkflow/qc_trim.nf | 13 ++-- 8 files changed, 82 insertions(+), 90 deletions(-) diff --git a/main.nf b/main.nf index 0baa584c97..ac2c2520df 100644 --- a/main.nf +++ b/main.nf @@ -350,10 +350,10 @@ workflow { fasta, germline_resource, known_indels, + modules, pon, step, - tools, - modules['bwa_index']) + tools) intervals = BUILD_INDICES.out.intervals @@ -387,8 +387,7 @@ workflow { input_sample, ('fastqc' in skip_qc || step != "mapping"), !(params.trim_fastq), - modules['fastqc'], - modules['trimgalore']) + modules) reads_input = QC_TRIM.out.reads @@ -404,17 +403,16 @@ workflow { if (save_bam_mapped) modules['samtools_index_mapping'].publish_results = "all" MAPPING( - step, - reads_input, - target_bed, - bwa, - fasta, - fai, - modules['samtools_index_mapping'], - modules['merge_bam_mapping'], ('bamqc' in skip_qc), ('samtools' in skip_qc), - save_bam_mapped) + bwa, + fai, + fasta, + modules, + reads_input, + save_bam_mapped, + step, + target_bed) bam_mapped = MAPPING.out.bam bam_mapped_qc = MAPPING.out.qc @@ -424,8 +422,8 @@ workflow { // STEP 2: MARKING DUPLICATES MARKDUPLICATES( - step, - bam_mapped) + bam_mapped, + step) bam_markduplicates = MARKDUPLICATES.out.bam @@ -434,16 +432,16 @@ workflow { // STEP 3: CREATING RECALIBRATION TABLES PREPARE_RECALIBRATION( - step, bam_markduplicates, - intervals, dbsnp, dbsnp_tbi, dict, fai, fasta, + intervals, known_indels, - known_indels_tbi) + known_indels_tbi, + step) table_bqsr = PREPARE_RECALIBRATION.out.table_bqsr @@ -453,17 +451,16 @@ workflow { if (step == 'recalibrate') bam_applybqsr = input_sample RECALIBRATE( - step, + ('bamqc' in skip_qc), + ('samtools' in skip_qc), bam_applybqsr, - intervals, - 
target_bed, dict, - fasta, fai, - modules['samtools_index_recalibrate'], - modules['merge_bam_recalibrate'], - ('bamqc' in skip_qc), - ('samtools' in skip_qc)) + fasta, + intervals, + modules, + step, + target_bed) bam_recalibrated = RECALIBRATE.out.bam bam_recalibrated_qc = RECALIBRATE.out.qc @@ -482,15 +479,15 @@ workflow { GERMLINE_VARIANT_CALLING( bam_variant_calling, - intervals, - tools, - target_bed, - dict, dbsnp, dbsnp_tbi, - fasta, + dict, fai, - modules) + fasta, + intervals, + modules, + target_bed, + tools) /* ================================================================================ diff --git a/modules/local/subworkflow/build_indices.nf b/modules/local/subworkflow/build_indices.nf index e5919aca9f..488b9de656 100644 --- a/modules/local/subworkflow/build_indices.nf +++ b/modules/local/subworkflow/build_indices.nf @@ -23,17 +23,17 @@ workflow BUILD_INDICES{ fasta // channel: [mandatory] fasta germline_resource // channel: [optional] germline_resource known_indels // channel: [optional] known_indels + modules // map: [mandatory] options for modules pon // channel: [optional] pon step // value: [mandatory] starting step tools // list: [optional] tools to run - bwa_index_opts // map: options for BWA_INDEX module main: result_bwa = Channel.empty() version_bwa = Channel.empty() if (!(params.bwa) && 'mapping' in step) - if (params.aligner == "bwa-mem") (result_bwa, version_bwa) = BWA_INDEX(fasta, bwa_index_opts) + if (params.aligner == "bwa-mem") (result_bwa, version_bwa) = BWA_INDEX(fasta, modules['bwa_index']) else result_bwa = BWAMEM2_INDEX(fasta) result_dict = Channel.empty() diff --git a/modules/local/subworkflow/germline_variant_calling.nf b/modules/local/subworkflow/germline_variant_calling.nf index 0878ce642b..ba48c158aa 100644 --- a/modules/local/subworkflow/germline_variant_calling.nf +++ b/modules/local/subworkflow/germline_variant_calling.nf @@ -13,15 +13,15 @@ include { STRELKA_GERMLINE as STRELKA } from '../../nf-core/software workflow GERMLINE_VARIANT_CALLING { take: bam // channel: [mandatory] bam - intervals // channel: [mandatory] intervals - tools // list: [mandatory] list of tools - target_bed // channel: [optional] target_bed - dict // channel: [mandatory] dict dbsnp // channel: [mandatory] dbsnp dbsnp_tbi // channel: [mandatory] dbsnp_tbi - fasta // channel: [mandatory] fasta + dict // channel: [mandatory] dict fai // channel: [mandatory] fai + fasta // channel: [mandatory] fasta + intervals // channel: [mandatory] intervals modules // map: [mandatory] maps for modules + target_bed // channel: [optional] target_bed + tools // list: [mandatory] list of tools main: diff --git a/modules/local/subworkflow/mapping.nf b/modules/local/subworkflow/mapping.nf index 6f3d1d5428..b1396dc735 100644 --- a/modules/local/subworkflow/mapping.nf +++ b/modules/local/subworkflow/mapping.nf @@ -13,34 +13,31 @@ include { SAMTOOLS_STATS } from '../../nf-core/software/samtools/stats' workflow MAPPING { take: - step // value: [mandatory] starting step - input_sample // channel: [mandatory] input_sample - target_bed // channel: [optional] target_bed - bwa // channel: [mandatory] bwa - fasta // channel: [mandatory] fasta - fai // channel: [mandatory] fai - samtools_opts // map: options for SAMTOOLS_INDEX module - merge_bam_opts // map: options for MERGE_BAM module skip_bamqc // boolean: true/false skip_samtools // boolean: true/false + bwa // channel: [mandatory] bwa + fai // channel: [mandatory] fai + fasta // channel: [mandatory] fasta + modules // map: options for modules + 
reads_input // channel: [mandatory] reads_input save_bam_mapped // boolean: true/false + step // value: [mandatory] starting step + target_bed // channel: [optional] target_bed main: - bam_mapped_indexed = Channel.empty() - bam_reports = Channel.empty() + bam_mapped_index = Channel.empty() + bam_reports = Channel.empty() if (step == "mapping") { - reads_input = input_sample - bam_bwamem1 = Channel.empty() bam_bwamem2 = Channel.empty() if (params.aligner == "bwa-mem") { - BWAMEM1_MEM(reads_input, bwa, fasta, fai, params.modules['bwa_mem']) + BWAMEM1_MEM(reads_input, bwa, fasta, fai, modules['bwa_mem']) bam_bwamem1 = BWAMEM1_MEM.out.bam } else { - BWAMEM2_MEM(reads_input, bwa, fasta, fai, params.modules['bwamem2_mem']) + BWAMEM2_MEM(reads_input, bwa, fasta, fai, modules['bwamem2_mem']) bam_bwamem2 = BWAMEM2_MEM.out } @@ -86,9 +83,9 @@ workflow MAPPING { // STEP 1.5: MERGING AND INDEXING BAM FROM MULTIPLE LANES - MERGE_BAM(bam_bwa_multiple, merge_bam_opts) - bam_mapped = bam_bwa_single.mix(MERGE_BAM.out.bam) - bam_mapped_indexed = SAMTOOLS_INDEX(bam_mapped, samtools_opts) + MERGE_BAM(bam_bwa_multiple, modules['merge_bam_mapping']) + bam_mapped = bam_bwa_single.mix(MERGE_BAM.out.bam) + bam_mapped_index = SAMTOOLS_INDEX(bam_mapped, modules['samtools_index_mapping']) qualimap_bamqc = Channel.empty() samtools_stats = Channel.empty() @@ -131,6 +128,6 @@ workflow MAPPING { } emit: - bam = bam_mapped_indexed + bam = bam_mapped_index qc = bam_reports } diff --git a/modules/local/subworkflow/markduplicates.nf b/modules/local/subworkflow/markduplicates.nf index 6ac36299ad..ba7f787f30 100644 --- a/modules/local/subworkflow/markduplicates.nf +++ b/modules/local/subworkflow/markduplicates.nf @@ -8,8 +8,8 @@ include { GATK_MARKDUPLICATES } from '../../nf-core/software/gatk/markduplicates workflow MARKDUPLICATES { take: - step // value: [mandatory] starting step bam_mapped // channel: [mandatory] bam_mapped + step // value: [mandatory] starting step main: diff --git a/modules/local/subworkflow/prepare_recalibration.nf b/modules/local/subworkflow/prepare_recalibration.nf index a6dec6dd2c..b56061503f 100644 --- a/modules/local/subworkflow/prepare_recalibration.nf +++ b/modules/local/subworkflow/prepare_recalibration.nf @@ -9,16 +9,16 @@ include { GATK_GATHERBQSRREPORTS as GATHERBQSRREPORTS } from '../../nf-core/soft workflow PREPARE_RECALIBRATION { take: - step // value: [mandatory] starting step bam_markduplicates // channel: [mandatory] bam_markduplicates - intervals // channel: [mandatory] intervals dbsnp // channel: [optional] dbsnp dbsnp_tbi // channel: [optional] dbsnp_tbi dict // channel: [mandatory] dict fai // channel: [mandatory] fai fasta // channel: [mandatory] fasta + intervals // channel: [mandatory] intervals known_indels // channel: [optional] known_indels known_indels_tbi // channel: [optional] known_indels_tbi + step // value: [mandatory] starting step main: diff --git a/modules/local/subworkflow/recalibrate.nf b/modules/local/subworkflow/recalibrate.nf index ab2e28667e..b42a50846d 100644 --- a/modules/local/subworkflow/recalibrate.nf +++ b/modules/local/subworkflow/recalibrate.nf @@ -12,34 +12,33 @@ include { QUALIMAP_BAMQC } from '../../nf-core/software/qualimap_ba workflow RECALIBRATE { take: - step // value: [mandatory] starting step - bam_applybqsr // channel: [mandatory] bam_applybqsr - intervals // channel: [mandatory] intervals - target_bed // channel: [optional] target_bed - dict // channel: [mandatory] dict - fasta // channel: [mandatory] fasta - fai // channel: [mandatory] fai - 
samtools_opts // map: options for SAMTOOLS_INDEX module - merge_bam_opts // map: options for MERGE_BAM module skip_bamqc // boolean: true/false skip_samtools // boolean: true/false + bam // channel: [mandatory] bam + dict // channel: [mandatory] dict + fai // channel: [mandatory] fai + fasta // channel: [mandatory] fasta + intervals // channel: [mandatory] intervals + modules // map: options for modules + step // value: [mandatory] starting step + target_bed // channel: [optional] target_bed main: - bam_recalibrated_indexed = Channel.empty() - bam_recalibrated = Channel.empty() - bam_reports = Channel.empty() + bam_recalibrated_index = Channel.empty() + bam_recalibrated = Channel.empty() + bam_reports = Channel.empty() if (step in ["mapping", "preparerecalibration", "recalibrate"]) { - bam_applybqsr = bam_applybqsr.combine(intervals) + bam_intervals = bam.combine(intervals) - APPLYBQSR(bam_applybqsr, dict, fasta, fai) + APPLYBQSR(bam_intervals, dict, fasta, fai) // STEP 4.5: MERGING AND INDEXING THE RECALIBRATED BAM FILES if (params.no_intervals) { - bam_recalibrated = APPLYBQSR.out.bam - tsv_recalibrated = APPLYBQSR.out.tsv + bam_recalibrated = APPLYBQSR.out.bam + tsv_recalibrated = APPLYBQSR.out.tsv } else { APPLYBQSR.out.bam.map{ meta, bam -> //, bai -> patient = meta.patient @@ -47,9 +46,9 @@ workflow RECALIBRATE { gender = meta.gender status = meta.status [patient, sample, gender, status, bam] //, bai] - }.groupTuple(by: [0,1]).set{ bam_recal_to_merge } + }.groupTuple(by: [0,1]).set{ bam_recalibrated_interval } - bam_recal_to_merge = bam_recal_to_merge.map { + bam_recalibrated_interval = bam_recalibrated_interval.map { patient, sample, gender, status, bam -> //, bai -> def meta = [:] @@ -62,12 +61,12 @@ workflow RECALIBRATE { [meta, bam] } - MERGE_BAM(bam_recal_to_merge, merge_bam_opts) - bam_recalibrated = MERGE_BAM.out.bam - tsv_recalibrated = MERGE_BAM.out.tsv + MERGE_BAM(bam_recalibrated_interval, modules['merge_bam_recalibrate']) + bam_recalibrated = MERGE_BAM.out.bam + tsv_recalibrated = MERGE_BAM.out.tsv } - bam_recalibrated_indexed = SAMTOOLS_INDEX(bam_recalibrated, samtools_opts) + bam_recalibrated_index = SAMTOOLS_INDEX(bam_recalibrated, modules['samtools_index_recalibrate']) qualimap_bamqc = Channel.empty() samtools_stats = Channel.empty() @@ -116,6 +115,6 @@ workflow RECALIBRATE { } emit: - bam = bam_recalibrated + bam = bam_recalibrated_index qc = bam_reports } diff --git a/modules/nf-core/subworkflow/qc_trim.nf b/modules/nf-core/subworkflow/qc_trim.nf index 1c6b16b863..961065a864 100644 --- a/modules/nf-core/subworkflow/qc_trim.nf +++ b/modules/nf-core/subworkflow/qc_trim.nf @@ -8,11 +8,10 @@ include { TRIMGALORE } from '../software/trimgalore' workflow QC_TRIM { take: - reads // channel: [ val(meta), [ reads ] ] - skip_fastqc // boolean: true/false - skip_trimming // boolean: true/false - fastqc_opts // map: options for FastQC module - trimgalore_opts // map: options for TrimGalore! 
module + reads // channel: [ val(meta), [ reads ] ] + skip_fastqc // boolean: true/false + skip_trimming // boolean: true/false + modules // map: options for modules main: @@ -20,7 +19,7 @@ workflow QC_TRIM { fastqc_version = Channel.empty() fastqc_zip = Channel.empty() if (!skip_fastqc) { - FASTQC(reads, fastqc_opts) + FASTQC(reads, modules['fastqc']) fastqc_html = FASTQC.out.html fastqc_version = FASTQC.out.version fastqc_zip = FASTQC.out.zip @@ -32,7 +31,7 @@ workflow QC_TRIM { trimgalore_log = Channel.empty() trimgalore_version = Channel.empty() if (!skip_trimming) { - TRIMGALORE(reads, trimgalore_opts) + TRIMGALORE(reads, modules['trimgalore']) trim_reads = TRIMGALORE.out.reads trimgalore_html = TRIMGALORE.out.html trimgalore_zip = TRIMGALORE.out.zip From 2e2d4a2dd1b3c3686c7500e165ca67a0015dadbf Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 7 Oct 2020 16:36:47 +0200 Subject: [PATCH 169/200] fix: subject -> patient --- modules/local/functions.nf | 94 +++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 46 deletions(-) diff --git a/modules/local/functions.nf b/modules/local/functions.nf index 9b8717a9b5..7293db1f18 100644 --- a/modules/local/functions.nf +++ b/modules/local/functions.nf @@ -86,7 +86,7 @@ def define_tool_list() { } // Channeling the TSV file containing BAM. -// Format is: "subject gender status sample bam bai" +// Format is: "patient gender status sample bam bai" def extract_bam(tsvFile) { Channel.from(tsvFile) .splitCsv(sep: '\t') @@ -119,6 +119,8 @@ def extract_fastq_from_dir(folder) { fastq = Channel.fromFilePairs(folder + '/*{_R1_,_R2_}*.fastq.gz') .ifEmpty { error "No directories found matching folder '${folder}'" } +// TODO check if flowcellLane_from_fastq is useful or not + fastq = fastq.map{ run, pair -> def meta = [:] meta.patient = sample @@ -135,8 +137,8 @@ def extract_fastq_from_dir(folder) { } // Channeling the TSV file containing FASTQ or BAM -// Format is: "subject gender status sample lane fastq1 fastq2" -// or: "subject gender status sample lane bam" +// Format is: "patient gender status sample lane fastq1 fastq2" +// or: "patient gender status sample lane bam" def extract_fastq(tsvFile) { Channel.from(tsvFile) .splitCsv(sep: '\t') @@ -165,27 +167,27 @@ def extract_fastq(tsvFile) { } } -// Channeling the TSV file containing mpileup -// Format is: "subject gender status sample pileup" -def extract_pileup(tsvFile) { - Channel.from(tsvFile) - .splitCsv(sep: '\t') - .map { row -> - check_number_of_item(row, 5) - def idPatient = row[0] - def gender = row[1] - def status = return_status(row[2].toInteger()) - def idSample = row[3] - def mpileup = return_file(row[4]) +// // Channeling the TSV file containing mpileup +// // Format is: "patient gender status sample pileup" +// def extract_pileup(tsvFile) { +// Channel.from(tsvFile) +// .splitCsv(sep: '\t') +// .map { row -> +// check_number_of_item(row, 5) +// def idPatient = row[0] +// def gender = row[1] +// def status = return_status(row[2].toInteger()) +// def idSample = row[3] +// def mpileup = return_file(row[4]) - if (!has_extension(mpileup, "pileup")) exit 1, "File: ${mpileup} has the wrong extension. See --help for more information" +// if (!has_extension(mpileup, "pileup")) exit 1, "File: ${mpileup} has the wrong extension. See --help for more information" - return [idPatient, gender, status, idSample, mpileup] - } -} +// return [idPatient, gender, status, idSample, mpileup] +// } +// } // Channeling the TSV file containing Recalibration Tables. 
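// (Illustrative example, not part of the patch: given the format described just
// below, one tab-separated row consumed by extract_recal could look like this,
// with hypothetical file names:
// patient1	XX	0	patient1_sample1	sample1.md.bam	sample1.md.bam.bai	sample1.recal.table)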
-// Format is: "subject gender status sample bam bai recalTable" +// Format is: "patient gender status sample bam bai recalTable" def extract_recal(tsvFile) { Channel.from(tsvFile) .splitCsv(sep: '\t') @@ -210,32 +212,32 @@ def extract_recal(tsvFile) { } } -// Parse first line of a FASTQ file, return the flowcell id and lane number. -def flowcellLane_from_fastq(path) { - // expected format: - // xx:yy:FLOWCELLID:LANE:... (seven fields) - // or - // FLOWCELLID:LANE:xx:... (five fields) - InputStream fileStream = new FileInputStream(path.toFile()) - InputStream gzipStream = new java.util.zip.GZIPInputStream(fileStream) - Reader decoder = new InputStreamReader(gzipStream, 'ASCII') - BufferedReader buffered = new BufferedReader(decoder) - def line = buffered.readLine() - assert line.startsWith('@') - line = line.substring(1) - def fields = line.split(' ')[0].split(':') - String fcid - int lane - if (fields.size() == 7) { - // CASAVA 1.8+ format - fcid = fields[2] - lane = fields[3].toInteger() - } else if (fields.size() == 5) { - fcid = fields[0] - lane = fields[1].toInteger() - } - [fcid, lane] -} +// // Parse first line of a FASTQ file, return the flowcell id and lane number. +// def flowcellLane_from_fastq(path) { +// // expected format: +// // xx:yy:FLOWCELLID:LANE:... (seven fields) +// // or +// // FLOWCELLID:LANE:xx:... (five fields) +// InputStream fileStream = new FileInputStream(path.toFile()) +// InputStream gzipStream = new java.util.zip.GZIPInputStream(fileStream) +// Reader decoder = new InputStreamReader(gzipStream, 'ASCII') +// BufferedReader buffered = new BufferedReader(decoder) +// def line = buffered.readLine() +// assert line.startsWith('@') +// line = line.substring(1) +// def fields = line.split(' ')[0].split(':') +// String fcid +// int lane +// if (fields.size() == 7) { +// // CASAVA 1.8+ format +// fcid = fields[2] +// lane = fields[3].toInteger() +// } else if (fields.size() == 5) { +// fcid = fields[0] +// lane = fields[1].toInteger() +// } +// [fcid, lane] +// } // Check file extension def has_extension(it, extension) { From 3675a9c45a790e00c6b48d01b1d7b50f593e3319 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 7 Oct 2020 16:45:47 +0200 Subject: [PATCH 170/200] fix: update container for process needing only awk --- modules/local/process/build_intervals.nf | 4 ++-- modules/local/process/create_intervals_bed.nf | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/modules/local/process/build_intervals.nf b/modules/local/process/build_intervals.nf index 01918147a7..65d3cfb600 100644 --- a/modules/local/process/build_intervals.nf +++ b/modules/local/process/build_intervals.nf @@ -4,9 +4,9 @@ process BUILD_INTERVALS { publishDir params.outdir, mode: params.publish_dir_mode, saveAs: {params.save_reference ? "reference_genome/${it}" : null } - container "nfcore/sarek:dsl2" + container "biocontainers/biocontainers:v1.2.0_cv1" - conda (params.conda ? "$baseDir/environment.yml" : null) + conda (params.conda ? "conda-forge::sed=4.7" : null) input: path fai diff --git a/modules/local/process/create_intervals_bed.nf b/modules/local/process/create_intervals_bed.nf index 4e93264a92..05fc396be4 100644 --- a/modules/local/process/create_intervals_bed.nf +++ b/modules/local/process/create_intervals_bed.nf @@ -3,6 +3,10 @@ include { has_extension } from '../functions' process CREATE_INTERVALS_BED { tag "${intervals}" + container "biocontainers/biocontainers:v1.2.0_cv1" + + conda (params.conda ? 
"conda-forge::sed=4.7" : null) + input: path intervals From 803b50315c22ce1ee1629907781a50d560ef8176 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 7 Oct 2020 16:46:13 +0200 Subject: [PATCH 171/200] fix: call to BUILD_INTERVALS process --- modules/local/subworkflow/build_indices.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/subworkflow/build_indices.nf b/modules/local/subworkflow/build_indices.nf index 488b9de656..589e6db947 100644 --- a/modules/local/subworkflow/build_indices.nf +++ b/modules/local/subworkflow/build_indices.nf @@ -65,7 +65,7 @@ workflow BUILD_INDICES{ result_intervals = Channel.from(file("${params.outdir}/no_intervals.bed")) } else if (!('annotate' in step) && !('controlfreec' in step)) if (!params.intervals) - result_intervals = CREATE_INTERVALS_BED(result_fai) + result_intervals = CREATE_INTERVALS_BED(BUILD_INTERVALS(result_fai)) else result_intervals = CREATE_INTERVALS_BED(file(params.intervals)) From 2c38b1266dc1bca4063f4869392825234b257c09 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Wed, 7 Oct 2020 16:56:25 +0200 Subject: [PATCH 172/200] Apply suggestions from code review Co-authored-by: FriederikeHanssen --- modules/local/process/merge_bam.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/local/process/merge_bam.nf b/modules/local/process/merge_bam.nf index 429f49d2a1..3c952084e4 100644 --- a/modules/local/process/merge_bam.nf +++ b/modules/local/process/merge_bam.nf @@ -13,11 +13,11 @@ process MERGE_BAM { output: tuple val(meta), path("${name}.bam"), emit: bam - val meta, emit: tsv + val meta, emit: tsv script: name = options.suffix ? "${meta.id}.${options.suffix}" : "${meta.id}" """ samtools merge --threads ${task.cpus} ${name}.bam ${bam} """ -} \ No newline at end of file +} From 4ff2f821c53a7daad11b2f9385ea7632f487610d Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Wed, 7 Oct 2020 17:00:56 +0200 Subject: [PATCH 173/200] Update main.nf Co-authored-by: FriederikeHanssen --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index ac2c2520df..30545742df 100644 --- a/main.nf +++ b/main.nf @@ -381,7 +381,7 @@ workflow { // STEP 0: QC & TRIM // `--skip_qc fastqc` to skip fastqc // trim only with `--trim_fastq` - // addtional options to be set up + // additional options to be set up QC_TRIM( input_sample, From f9121b6f12978f56ba3d391f4e8452f273a9f690 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 7 Oct 2020 17:05:21 +0200 Subject: [PATCH 174/200] fix: sort out params --- conf/modules.config | 70 ++++++++++++++++++++++----------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 11f83d2a27..39dc11f1bc 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -6,25 +6,27 @@ params { modules { - 'fastqc' { - args = "--quiet" + 'bwa_index' { + args = "" suffix = "" - publish_dir = "Reports/FastQC" + publish_dir = "genome/bwa_index" publish_results = "all" } - 'trimgalore' { - args = "--fastqc" + 'bwa_mem' { + args = "-K 100000000 -M" + args2 = "" + extra = "" suffix = "" - publish_dir = "trimgalore" + publish_dir = "" publish_results = "all" } - 'bwa_index' { + 'bwamem2_index' { args = "" suffix = "" publish_dir = "genome/bwa_index" publish_results = "all" } - 'bwa_mem' { + 'bwamem2_mem' { args = "-K 100000000 -M" args2 = "" extra = "" @@ -32,25 +34,34 @@ params { publish_dir = "" publish_results = "all" } - 'strelka_germline' { + 'concat_vcf_haplotypecaller' { args = 
"" - extra = "" suffix = "" - publish_dir = "strelka" + publish_dir = "haplotypecaller" publish_results = "all" } - 'bwamem2_index' { + 'concat_vcf_haplotypecallergvcf' { args = "" + suffix = ".g" + publish_dir = "haplotypecallergvcf" + publish_results = "all" + } + 'fastqc' { + args = "--quiet" suffix = "" - publish_dir = "genome/bwa_index" + publish_dir = "Reports/FastQC" publish_results = "all" } - 'bwamem2_mem' { - args = "-K 100000000 -M" - args2 = "" - extra = "" + 'gatk_markduplicates' { + args = "ASSUME_SORTED=true REMOVE_DUPLICATES=false VALIDATION_STRINGENCY=LENIENT TMP_DIR=tmp" + suffix = ".mLb.mkD" + publish_dir = "bwa/mergedLibrary" + publish_results = "all" + } + 'get_software_versions' { + args = "" suffix = "" - publish_dir = "" + publish_dir = "pipeline_info" publish_results = "all" } 'merge_bam_mapping' { @@ -73,28 +84,17 @@ params { publish_dir_down = "Recalibrated" publish_results = "all" } - 'concat_vcf_haplotypecaller' { + 'strelka_germline' { args = "" + extra = "" suffix = "" - publish_dir = "haplotypecaller" - publish_results = "all" - } - 'concat_vcf_haplotypecallergvcf' { - args = "" - suffix = ".g" - publish_dir = "haplotypecallergvcf" - publish_results = "all" - } - 'gatk_markduplicates' { - args = "ASSUME_SORTED=true REMOVE_DUPLICATES=false VALIDATION_STRINGENCY=LENIENT TMP_DIR=tmp" - suffix = ".mLb.mkD" - publish_dir = "bwa/mergedLibrary" + publish_dir = "strelka" publish_results = "all" } - 'get_software_versions' { - args = "" + 'trimgalore' { + args = "--fastqc" suffix = "" - publish_dir = "pipeline_info" + publish_dir = "trimgalore" publish_results = "all" } } From 80fc32f1d3b01808e3c4158169c99aad2479c138 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 7 Oct 2020 17:08:43 +0200 Subject: [PATCH 175/200] fix: complete params --- conf/modules.config | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 39dc11f1bc..a676ea488a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -8,6 +8,8 @@ params { modules { 'bwa_index' { args = "" + args2 = "" + extra = "" suffix = "" publish_dir = "genome/bwa_index" publish_results = "all" @@ -18,12 +20,14 @@ params { extra = "" suffix = "" publish_dir = "" - publish_results = "all" + publish_results = "none" } 'bwamem2_index' { args = "" + args2 = "" + extra = "" suffix = "" - publish_dir = "genome/bwa_index" + publish_dir = "genome/bwamem2_index" publish_results = "all" } 'bwamem2_mem' { @@ -32,60 +36,87 @@ params { extra = "" suffix = "" publish_dir = "" - publish_results = "all" + publish_results = "none" } 'concat_vcf_haplotypecaller' { args = "" + args2 = "" + extra = "" suffix = "" publish_dir = "haplotypecaller" publish_results = "all" } 'concat_vcf_haplotypecallergvcf' { args = "" + args2 = "" + extra = "" suffix = ".g" publish_dir = "haplotypecallergvcf" publish_results = "all" } 'fastqc' { args = "--quiet" + args = "" + extra = "" suffix = "" publish_dir = "Reports/FastQC" publish_results = "all" } 'gatk_markduplicates' { args = "ASSUME_SORTED=true REMOVE_DUPLICATES=false VALIDATION_STRINGENCY=LENIENT TMP_DIR=tmp" - suffix = ".mLb.mkD" + args2 = "" + extra = "" + suffix = ".md" publish_dir = "bwa/mergedLibrary" publish_results = "all" } 'get_software_versions' { args = "" + args2 = "" + extra = "" suffix = "" publish_dir = "pipeline_info" publish_results = "all" } 'merge_bam_mapping' { + args = "" + args2 = "" + extra = "" suffix = "" + publish_dir = "" + publish_results = "none" } 
'merge_bam_recalibrate' { + args = "" + args2 = "" + extra = "" suffix = "md" + publish_dir = "" + publish_results = "none" } 'samtools_index_mapping' { args = "" + args2 = "" + extra = "" suffix = "" + publish_dir = "" publish_dir_up = "Preprocessing" publish_dir_down = "Mapped" publish_results = "none" } 'samtools_index_recalibrate' { args = "" + args2 = "" + extra = "" suffix = "md" + publish_dir = "" publish_dir_up = "Preprocessing" publish_dir_down = "Recalibrated" publish_results = "all" } 'strelka_germline' { args = "" + args2 = "" extra = "" suffix = "" publish_dir = "strelka" @@ -93,6 +124,8 @@ params { } 'trimgalore' { args = "--fastqc" + args2 = "" + extra = "" suffix = "" publish_dir = "trimgalore" publish_results = "all" From eddce147cab6d633f95a564915fb7f039f73b0ef Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 7 Oct 2020 17:28:45 +0200 Subject: [PATCH 176/200] add: .vscode --- .gitignore | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index d4c81ee6b5..f567ae9a29 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,10 @@ +*.pyc +.DS_Store .nextflow* -work/ +.vscode data/ references/ results/ -.DS_Store -tests/ testing/ -*.pyc +tests/ +work/ From f0c32c451429ead78d61e96ab2d6438ed3171717 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 7 Oct 2020 17:40:55 +0200 Subject: [PATCH 177/200] reorganize tsv files and some output directory --- conf/modules.config | 8 +- main.nf | 92 ++----------------- modules/local/subworkflow/build_indices.nf | 2 +- modules/local/subworkflow/mapping.nf | 12 +-- modules/local/subworkflow/markduplicates.nf | 36 ++++---- .../subworkflow/prepare_recalibration.nf | 14 +-- modules/local/subworkflow/recalibrate.nf | 12 +-- modules/nf-core/software/bwamem2_index.nf | 1 + .../nf-core/software/gatk/markduplicates.nf | 4 +- 9 files changed, 55 insertions(+), 126 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index a676ea488a..512a7a6dd6 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -100,8 +100,8 @@ params { extra = "" suffix = "" publish_dir = "" - publish_dir_up = "Preprocessing" - publish_dir_down = "Mapped" + publish_dir_up = "preprocessing" + publish_dir_down = "mapped" publish_results = "none" } 'samtools_index_recalibrate' { @@ -110,8 +110,8 @@ params { extra = "" suffix = "md" publish_dir = "" - publish_dir_up = "Preprocessing" - publish_dir_down = "Recalibrated" + publish_dir_up = "preprocessing" + publish_dir_down = "recalibrated" publish_results = "all" } 'strelka_germline' { diff --git a/main.nf b/main.nf index 30545742df..6b6e6525a2 100644 --- a/main.nf +++ b/main.nf @@ -78,9 +78,6 @@ if (params.genomes && !params.genomes.containsKey(params.genome) && !params.igen step_list = define_step_list() step = params.step ? params.step.toLowerCase().replaceAll('-', '').replaceAll('_', '') : '' -// Handle deprecation -if (step == 'preprocessing') step = 'mapping' - if (step.contains(',')) exit 1, 'You can choose only one step, see --help for more information' if (!check_parameter_existence(step, step_list)) exit 1, "Unknown step ${step}, see --help for more information" @@ -114,28 +111,28 @@ save_bam_mapped = params.skip_markduplicates ? true : params.save_bam_mapped ? 
t if (!params.input && params.sentieon) { switch (step) { case 'mapping': break - case 'recalibrate': tsv_path = "${params.outdir}/Preprocessing/TSV/sentieon_deduped.tsv"; break - case 'variantcalling': tsv_path = "${params.outdir}/Preprocessing/TSV/sentieon_recalibrated.tsv"; break + case 'recalibrate': tsv_path = "${params.outdir}/preprocessing/tsv/sentieon_deduped.tsv"; break + case 'variantcalling': tsv_path = "${params.outdir}/preprocessing/tsv/sentieon_recalibrated.tsv"; break case 'annotate': break default: exit 1, "Unknown step ${step}" } } else if (!params.input && !params.sentieon && !params.skip_markduplicates) { switch (step) { case 'mapping': break - case 'preparerecalibration': tsv_path = "${params.outdir}/Preprocessing/TSV/duplicates_marked_no_table.tsv"; break - case 'recalibrate': tsv_path = "${params.outdir}/Preprocessing/TSV/duplicates_marked.tsv"; break - case 'variantcalling': tsv_path = "${params.outdir}/Preprocessing/TSV/recalibrated.tsv"; break - case 'controlfreec': tsv_path = "${params.outdir}/VariantCalling/TSV/control-freec_mpileup.tsv"; break + case 'preparerecalibration': tsv_path = "${params.outdir}/preprocessing/tsv/markduplicates_no_table.tsv"; break + case 'recalibrate': tsv_path = "${params.outdir}/preprocessing/tsv/markduplicates.tsv"; break + case 'variantcalling': tsv_path = "${params.outdir}/preprocessing/tsv/recalibrated.tsv"; break + case 'controlfreec': tsv_path = "${params.outdir}/variant_calling/tsv/control-freec_mpileup.tsv"; break case 'annotate': break default: exit 1, "Unknown step ${step}" } } else if (!params.input && !params.sentieon && params.skip_markduplicates) { switch (step) { case 'mapping': break - case 'preparerecalibration': tsv_path = "${params.outdir}/Preprocessing/TSV/mapped.tsv"; break - case 'recalibrate': tsv_path = "${params.outdir}/Preprocessing/TSV/mapped_no_duplicates_marked.tsv"; break - case 'variantcalling': tsv_path = "${params.outdir}/Preprocessing/TSV/recalibrated.tsv"; break - case 'controlfreec': tsv_path = "${params.outdir}/VariantCalling/TSV/control-freec_mpileup.tsv"; break + case 'preparerecalibration': tsv_path = "${params.outdir}/preprocessing/tsv/mapped.tsv"; break + case 'recalibrate': tsv_path = "${params.outdir}/preprocessing/tsv/mapped_no_markduplicates.tsv"; break + case 'variantcalling': tsv_path = "${params.outdir}/preprocessing/tsv/recalibrated.tsv"; break + case 'controlfreec': tsv_path = "${params.outdir}/variant_calling/tsv/control-freec_mpileup.tsv"; break case 'annotate': break default: exit 1, "Unknown step ${step}" } @@ -536,75 +533,6 @@ workflow.onComplete { // ================================================================================ // */ - - - -// // Here we have a recalibrated bam set -// // The TSV file is formatted like: "idPatient status idSample bamFile baiFile" -// // Manta will be run in Germline mode, or in Tumor mode depending on status -// // HaplotypeCaller, TIDDIT and Strelka will be run for Normal and Tumor samples - -// (bamMantaSingle, bamStrelkaSingle, bamTIDDIT, bamFreebayesSingleNoIntervals, bamHaplotypeCallerNoIntervals, bamRecalAll) = bam_recalibrated.into(6) - - -// // To speed Variant Callers up we are chopping the reference into smaller pieces -// // Do variant calling by this intervals, and re-merge the VCFs -// bamFreebayesSingle = bamFreebayesSingleNoIntervals.spread(intFreebayesSingle) - - - - - - - - -// // STEP STRELKA.1 - SINGLE MODE - -// process StrelkaSingle { -// label 'cpus_max' -// label 'memory_max' - -// tag "${idSample}" - -// publishDir 
"${params.outdir}/VariantCalling/${idSample}/Strelka", mode: params.publish_dir_mode - -// input: -// set idPatient, idSample, file(bam), file(bai) from bamStrelkaSingle -// file(fasta) from fasta -// file(fastaFai) from fai -// file(targetBED) from ch_target_bed - -// output: -// set val("Strelka"), idPatient, idSample, file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfStrelkaSingle - -// when: 'strelka' in tools - -// script: -// beforeScript = params.target_bed ? "bgzip --threads ${task.cpus} -c ${targetBED} > call_targets.bed.gz ; tabix call_targets.bed.gz" : "" -// options = params.target_bed ? "--exome --callRegions call_targets.bed.gz" : "" -// """ -// ${beforeScript} -// configureStrelkaGermlineWorkflow.py \ -// --bam ${bam} \ -// --referenceFasta ${fasta} \ -// ${options} \ -// --runDir Strelka - -// python Strelka/runWorkflow.py -m local -j ${task.cpus} - -// mv Strelka/results/variants/genome.*.vcf.gz \ -// Strelka_${idSample}_genome.vcf.gz -// mv Strelka/results/variants/genome.*.vcf.gz.tbi \ -// Strelka_${idSample}_genome.vcf.gz.tbi -// mv Strelka/results/variants/variants.vcf.gz \ -// Strelka_${idSample}_variants.vcf.gz -// mv Strelka/results/variants/variants.vcf.gz.tbi \ -// Strelka_${idSample}_variants.vcf.gz.tbi -// """ -// } - -// vcfStrelkaSingle = vcfStrelkaSingle.dump(tag:'Strelka - Single Mode') - // // STEP MANTA.1 - SINGLE MODE // process MantaSingle { diff --git a/modules/local/subworkflow/build_indices.nf b/modules/local/subworkflow/build_indices.nf index 589e6db947..3237650683 100644 --- a/modules/local/subworkflow/build_indices.nf +++ b/modules/local/subworkflow/build_indices.nf @@ -34,7 +34,7 @@ workflow BUILD_INDICES{ version_bwa = Channel.empty() if (!(params.bwa) && 'mapping' in step) if (params.aligner == "bwa-mem") (result_bwa, version_bwa) = BWA_INDEX(fasta, modules['bwa_index']) - else result_bwa = BWAMEM2_INDEX(fasta) + else result_bwa = BWAMEM2_INDEX(fasta, modules['bwamem2_index']) result_dict = Channel.empty() if (!(params.dict) && !('annotate' in step) && !('controlfreec' in step)) diff --git a/modules/local/subworkflow/mapping.nf b/modules/local/subworkflow/mapping.nf index b1396dc735..afe41428d8 100644 --- a/modules/local/subworkflow/mapping.nf +++ b/modules/local/subworkflow/mapping.nf @@ -105,13 +105,13 @@ workflow MAPPING { if (save_bam_mapped) { tsv_bam_mapped = bam_mapped.map { meta, bam -> [meta] } // Creating TSV files to restart from this step - tsv_bam_mapped.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> + tsv_bam_mapped.collectFile(storeDir: "${params.outdir}/preprocessing/tsv") { meta -> patient = meta.patient[0] sample = meta.sample[0] gender = meta.gender[0] status = meta.status[0] - bam = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam" - bai = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam.bai" + bam = "${params.outdir}/preprocessing/${sample}/mapped/${sample}.bam" + bai = "${params.outdir}/preprocessing/${sample}/mapped/${sample}.bam.bai" ["mapped_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n"] } @@ -120,10 +120,10 @@ workflow MAPPING { sample = meta.sample[0] gender = meta.gender[0] status = meta.status[0] - bam = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam" - bai = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam.bai" + bam = "${params.outdir}/preprocessing/${sample}/mapped/${sample}.bam" + bai = "${params.outdir}/preprocessing/${sample}/mapped/${sample}.bam.bai" 
"${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n" - }.collectFile(name: "mapped.tsv", sort: true, storeDir: "${params.outdir}/Preprocessing/TSV") + }.collectFile(name: "mapped.tsv", sort: true, storeDir: "${params.outdir}/preprocessing/tsv") } } diff --git a/modules/local/subworkflow/markduplicates.nf b/modules/local/subworkflow/markduplicates.nf index ba7f787f30..47e550c960 100644 --- a/modules/local/subworkflow/markduplicates.nf +++ b/modules/local/subworkflow/markduplicates.nf @@ -24,15 +24,15 @@ workflow MARKDUPLICATES { tsv_markduplicates = GATK_MARKDUPLICATES.out.tsv // Creating TSV files to restart from this step - tsv_markduplicates.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> + tsv_markduplicates.collectFile(storeDir: "${params.outdir}/preprocessing/tsv") { meta -> patient = meta.patient sample = meta.sample gender = meta.gender status = meta.status - bam = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam" - bai = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam.bai" - table = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.recal.table" - ["duplicates_marked_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n"] + bam = "${params.outdir}/preprocessing/${sample}/markduplicates/${sample}.md.bam" + bai = "${params.outdir}/preprocessing/${sample}/markduplicates/${sample}.md.bam.bai" + table = "${params.outdir}/preprocessing/${sample}/markduplicates/${sample}.recal.table" + ["markduplicates_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n"] } tsv_markduplicates.map { meta -> @@ -40,24 +40,24 @@ workflow MARKDUPLICATES { sample = meta.sample gender = meta.gender status = meta.status - bam = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam" - bai = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam.bai" - table = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.recal.table" + bam = "${params.outdir}/preprocessing/${sample}/markduplicates/${sample}.md.bam" + bai = "${params.outdir}/preprocessing/${sample}/markduplicates/${sample}.md.bam.bai" + table = "${params.outdir}/preprocessing/${sample}/markduplicates/${sample}.recal.table" "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n" - }.collectFile(name: 'duplicates_marked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV") + }.collectFile(name: 'markduplicates.tsv', sort: true, storeDir: "${params.outdir}/preprocessing/tsv") } else { tsv_no_markduplicates = bam_markduplicates.map { meta, bam, bai -> [meta] } // Creating TSV files to restart from this step - tsv_no_markduplicates.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> + tsv_no_markduplicates.collectFile(storeDir: "${params.outdir}/preprocessing/tsv") { meta -> patient = meta.patient[0] sample = meta.sample[0] gender = meta.gender[0] status = meta.status[0] - bam = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam" - bai = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam.bai" - table = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.recal.table" - ["mapped_no_duplicates_marked_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n"] + bam = "${params.outdir}/preprocessing/${sample}/mapped/${sample}.bam" + bai = "${params.outdir}/preprocessing/${sample}/mapped/${sample}.bam.bai" + table = 
"${params.outdir}/preprocessing/${sample}/mapped/${sample}.recal.table" + ["mapped_no_markduplicates_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n"] } tsv_no_markduplicates.map { meta -> @@ -65,11 +65,11 @@ workflow MARKDUPLICATES { sample = meta.sample[0] gender = meta.gender[0] status = meta.status[0] - bam = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam" - bai = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.bam.bai" - table = "${params.outdir}/Preprocessing/${sample}/Mapped/${sample}.recal.table" + bam = "${params.outdir}/preprocessing/${sample}/mapped/${sample}.bam" + bai = "${params.outdir}/preprocessing/${sample}/mapped/${sample}.bam.bai" + table = "${params.outdir}/preprocessing/${sample}/mapped/${sample}.recal.table" "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\t${table}\n" - }.collectFile(name: 'mapped_no_duplicates_marked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV") + }.collectFile(name: 'mapped_no_markduplicates.tsv', sort: true, storeDir: "${params.outdir}/preprocessing/tsv") } } diff --git a/modules/local/subworkflow/prepare_recalibration.nf b/modules/local/subworkflow/prepare_recalibration.nf index b56061503f..30817bc0be 100644 --- a/modules/local/subworkflow/prepare_recalibration.nf +++ b/modules/local/subworkflow/prepare_recalibration.nf @@ -62,14 +62,14 @@ workflow PREPARE_RECALIBRATION { } // Creating TSV files to restart from this step - tsv_bqsr.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> + tsv_bqsr.collectFile(storeDir: "${params.outdir}/preprocessing/tsv") { meta -> patient = meta.patient sample = meta.sample gender = meta.gender status = meta.status - bam = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam" - bai = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam.bai" - ["duplicates_marked_no_table_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n"] + bam = "${params.outdir}/preprocessing/${sample}/markduplicates/${sample}.md.bam" + bai = "${params.outdir}/preprocessing/${sample}/markduplicates/${sample}.md.bam.bai" + ["markduplicates_no_table_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n"] } tsv_bqsr.map { meta -> @@ -77,10 +77,10 @@ workflow PREPARE_RECALIBRATION { sample = meta.sample gender = meta.gender status = meta.status - bam = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam" - bai = "${params.outdir}/Preprocessing/${sample}/DuplicatesMarked/${sample}.md.bam.bai" + bam = "${params.outdir}/preprocessing/${sample}/markduplicates/${sample}.md.bam" + bai = "${params.outdir}/preprocessing/${sample}/markduplicates/${sample}.md.bam.bai" "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n" - }.collectFile(name: 'duplicates_marked_no_table.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV") + }.collectFile(name: 'markduplicates_no_table.tsv', sort: true, storeDir: "${params.outdir}/preprocessing/tsv") } emit: diff --git a/modules/local/subworkflow/recalibrate.nf b/modules/local/subworkflow/recalibrate.nf index b42a50846d..f910120fd0 100644 --- a/modules/local/subworkflow/recalibrate.nf +++ b/modules/local/subworkflow/recalibrate.nf @@ -93,13 +93,13 @@ workflow RECALIBRATE { // // When starting with variant calling, Channel bam_recalibrated is input_sample // if (step == 'variantcalling') bam_recalibrated = input_sample // Creating TSV files to restart from this step - 
tsv_recalibrated.collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { meta -> + tsv_recalibrated.collectFile(storeDir: "${params.outdir}/preprocessing/tsv") { meta -> patient = meta.patient sample = meta.sample gender = meta.gender status = meta.status - bam = "${params.outdir}/Preprocessing/${sample}/Recalibrated/${sample}.md.bam" - bai = "${params.outdir}/Preprocessing/${sample}/Recalibrated/${sample}.md.bam.bai" + bam = "${params.outdir}/preprocessing/${sample}/recalibrated/${sample}.md.bam" + bai = "${params.outdir}/preprocessing/${sample}/recalibrated/${sample}.md.bam.bai" ["recalibrated_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n"] } @@ -108,10 +108,10 @@ workflow RECALIBRATE { sample = meta.sample gender = meta.gender status = meta.status - bam = "${params.outdir}/Preprocessing/${sample}/Recalibrated/${sample}.md.bam" - bai = "${params.outdir}/Preprocessing/${sample}/Recalibrated/${sample}.md.bam.bai" + bam = "${params.outdir}/preprocessing/${sample}/recalibrated/${sample}.md.bam" + bai = "${params.outdir}/preprocessing/${sample}/recalibrated/${sample}.md.bam.bai" "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n" - }.collectFile(name: 'recalibrated.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV") + }.collectFile(name: 'recalibrated.tsv', sort: true, storeDir: "${params.outdir}/preprocessing/tsv") } emit: diff --git a/modules/nf-core/software/bwamem2_index.nf b/modules/nf-core/software/bwamem2_index.nf index 3b5e6c50fb..d7d7069abe 100644 --- a/modules/nf-core/software/bwamem2_index.nf +++ b/modules/nf-core/software/bwamem2_index.nf @@ -13,6 +13,7 @@ process BWAMEM2_INDEX { output: path "${fasta}.*" + val options script: """ diff --git a/modules/nf-core/software/gatk/markduplicates.nf b/modules/nf-core/software/gatk/markduplicates.nf index 4b79304d82..659750fa4c 100644 --- a/modules/nf-core/software/gatk/markduplicates.nf +++ b/modules/nf-core/software/gatk/markduplicates.nf @@ -4,8 +4,8 @@ process GATK_MARKDUPLICATES { publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { - if (it == "${meta.sample}.bam.metrics") "Reports/${meta.sample}/MarkDuplicates/${it}" - else "Preprocessing/${meta.sample}/DuplicatesMarked/${it}" + if (it == "${meta.sample}.bam.metrics") "Reports/${meta.sample}/markduplicates/${it}" + else "preprocessing/${meta.sample}/markduplicates/${it}" } container "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" From ffd6dd9f28122d83fabdd98f7a5fb13d008a69c3 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 7 Oct 2020 18:04:37 +0200 Subject: [PATCH 178/200] fix: bwamem2 input channels --- modules/nf-core/software/bwamem2_index.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/nf-core/software/bwamem2_index.nf b/modules/nf-core/software/bwamem2_index.nf index d7d7069abe..7e0f86f723 100644 --- a/modules/nf-core/software/bwamem2_index.nf +++ b/modules/nf-core/software/bwamem2_index.nf @@ -4,16 +4,16 @@ process BWAMEM2_INDEX { publishDir params.outdir, mode: params.publish_dir_mode, saveAs: {params.save_reference ? "reference_genome/BWAIndex/${it}" : null } - input: - path fasta - container "quay.io/biocontainers/bwa-mem2:2.0--he513fc3_1" conda (params.conda ? 
"bioconda::bwa-mem2=2.0" : null) + input: + path fasta + val options + output: path "${fasta}.*" - val options script: """ From 9ec8fc51fcdda67886d2ddf555b0e689e124b328 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 7 Oct 2020 18:23:42 +0200 Subject: [PATCH 179/200] use only one functions.nf file for nf-core modules --- .../nf-core/software/bwa/index/functions.nf | 59 ------------------- modules/nf-core/software/functions.nf | 6 +- 2 files changed, 3 insertions(+), 62 deletions(-) delete mode 100644 modules/nf-core/software/bwa/index/functions.nf diff --git a/modules/nf-core/software/bwa/index/functions.nf b/modules/nf-core/software/bwa/index/functions.nf deleted file mode 100644 index b3ac38015b..0000000000 --- a/modules/nf-core/software/bwa/index/functions.nf +++ /dev/null @@ -1,59 +0,0 @@ -/* - * ----------------------------------------------------- - * Utility functions used in nf-core DSL2 module files - * ----------------------------------------------------- - */ - -/* - * Extract name of software tool from process name using $task.process - */ -def getSoftwareName(task_process) { - return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() -} - -/* - * Function to initialise default values and to generate a Groovy Map of available options for nf-core modules - */ -def initOptions(Map args) { - def Map options = [:] - options.args = args.args ?: '' - options.args2 = args.args2 ?: '' - options.publish_by_id = args.publish_by_id ?: false - options.publish_dir = args.publish_dir ?: '' - options.publish_files = args.publish_files - options.suffix = args.suffix ?: '' - return options -} - -/* - * Tidy up and join elements of a list to return a path string - */ -def getPathFromList(path_list) { - def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries - paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes - return paths.join('/') -} - -/* - * Function to save/publish module results - */ -def saveFiles(Map args) { - if (!args.filename.endsWith('.version.txt')) { - def ioptions = initOptions(args.options) - def path_list = [ ioptions.publish_dir ?: args.publish_dir ] - if (ioptions.publish_by_id) { - path_list.add(args.publish_id) - } - if (ioptions.publish_files instanceof Map) { - for (ext in ioptions.publish_files) { - if (args.filename.endsWith(ext.key)) { - def ext_list = path_list.collect() - ext_list.add(ext.value) - return "${getPathFromList(ext_list)}/$args.filename" - } - } - } else if (ioptions.publish_files == null) { - return "${getPathFromList(path_list)}/$args.filename" - } - } -} diff --git a/modules/nf-core/software/functions.nf b/modules/nf-core/software/functions.nf index c284945c4b..d25eea86b3 100644 --- a/modules/nf-core/software/functions.nf +++ b/modules/nf-core/software/functions.nf @@ -20,7 +20,7 @@ def initOptions(Map args) { options.args2 = args.args2 ?: '' options.publish_by_id = args.publish_by_id ?: false options.publish_dir = args.publish_dir ?: '' - options.publish_files = args.publish_files ?: null + options.publish_files = args.publish_files options.suffix = args.suffix ?: '' return options } @@ -52,8 +52,8 @@ def saveFiles(Map args) { return "${getPathFromList(ext_list)}/$args.filename" } } - } else { + } else if (ioptions.publish_files == null) { return "${getPathFromList(path_list)}/$args.filename" } } -} \ No newline at end of file +} From d660683f34bd374806004b665642507691259bae Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 7 Oct 2020 18:44:36 
+0200 Subject: [PATCH 180/200] add publish_by_id --- conf/modules.config | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/conf/modules.config b/conf/modules.config index 512a7a6dd6..c3a872b568 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -11,6 +11,7 @@ params { args2 = "" extra = "" suffix = "" + publish_by_id = "false" publish_dir = "genome/bwa_index" publish_results = "all" } @@ -19,6 +20,7 @@ params { args2 = "" extra = "" suffix = "" + publish_by_id = "false" publish_dir = "" publish_results = "none" } @@ -27,6 +29,7 @@ params { args2 = "" extra = "" suffix = "" + publish_by_id = "false" publish_dir = "genome/bwamem2_index" publish_results = "all" } @@ -35,6 +38,7 @@ params { args2 = "" extra = "" suffix = "" + publish_by_id = "false" publish_dir = "" publish_results = "none" } @@ -43,6 +47,7 @@ params { args2 = "" extra = "" suffix = "" + publish_by_id = "false" publish_dir = "haplotypecaller" publish_results = "all" } @@ -51,6 +56,7 @@ params { args2 = "" extra = "" suffix = ".g" + publish_by_id = "false" publish_dir = "haplotypecallergvcf" publish_results = "all" } @@ -59,6 +65,7 @@ params { args = "" extra = "" suffix = "" + publish_by_id = "false" publish_dir = "Reports/FastQC" publish_results = "all" } @@ -67,6 +74,7 @@ params { args2 = "" extra = "" suffix = ".md" + publish_by_id = "false" publish_dir = "bwa/mergedLibrary" publish_results = "all" } @@ -75,6 +83,7 @@ params { args2 = "" extra = "" suffix = "" + publish_by_id = "false" publish_dir = "pipeline_info" publish_results = "all" } @@ -83,6 +92,7 @@ params { args2 = "" extra = "" suffix = "" + publish_by_id = "false" publish_dir = "" publish_results = "none" } @@ -91,6 +101,7 @@ params { args2 = "" extra = "" suffix = "md" + publish_by_id = "false" publish_dir = "" publish_results = "none" } @@ -99,6 +110,7 @@ params { args2 = "" extra = "" suffix = "" + publish_by_id = "false" publish_dir = "" publish_dir_up = "preprocessing" publish_dir_down = "mapped" @@ -109,6 +121,7 @@ params { args2 = "" extra = "" suffix = "md" + publish_by_id = "false" publish_dir = "" publish_dir_up = "preprocessing" publish_dir_down = "recalibrated" @@ -119,6 +132,7 @@ params { args2 = "" extra = "" suffix = "" + publish_by_id = "false" publish_dir = "strelka" publish_results = "all" } @@ -127,6 +141,7 @@ params { args2 = "" extra = "" suffix = "" + publish_by_id = "false" publish_dir = "trimgalore" publish_results = "all" } From 1f96f2220114f51c167eaa6ef93983ab6347e73e Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 7 Oct 2020 18:45:58 +0200 Subject: [PATCH 181/200] update modules with new syntax + code polishing --- modules/nf-core/software/bwa/index/main.nf | 11 ++++++----- modules/nf-core/software/bwamem2_index.nf | 13 ++++++++++--- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/modules/nf-core/software/bwa/index/main.nf b/modules/nf-core/software/bwa/index/main.nf index c951811e2a..9a7e5928c9 100644 --- a/modules/nf-core/software/bwa/index/main.nf +++ b/modules/nf-core/software/bwa/index/main.nf @@ -1,10 +1,11 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' +include { initOptions; saveFiles; getSoftwareName } from './../../functions' process BWA_INDEX { - tag "$fasta" + tag "${fasta}" + label 'process_high' - publishDir "${params.outdir}", + + publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:'') } 
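Stepping back from this diff for a moment: patch 181 and its neighbours (179 through 185) converge on a single nf-core module template. Every module imports initOptions, saveFiles and getSoftwareName from the shared functions.nf, accepts a "val options" map (one stanza of the params.modules map in conf/modules.config) alongside its data inputs, routes publishing through saveFiles, and records a <software>.version.txt file. The following minimal sketch shows that template in one place; EXAMPLE_TOOL, its container tag, conda recipe and command line are invented placeholders, while the boilerplate around them mirrors the diffs in this series:

    include { initOptions; saveFiles; getSoftwareName } from './functions' // path relative to the module file

    process EXAMPLE_TOOL {
        tag "${fasta}"

        publishDir params.outdir, mode: params.publish_dir_mode,
            saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:'') }

        container "quay.io/biocontainers/example-tool:1.0--0" // placeholder image

        conda (params.conda ? "bioconda::example-tool=1.0" : null) // placeholder recipe

        input:
        path fasta
        val options // one entry of params.modules, e.g. modules['example_tool']

        output:
        path "${fasta}.out"
        path "*.version.txt", emit: version

        script:
        def software = getSoftwareName(task.process) // EXAMPLE_TOOL -> "example"
        def ioptions = initOptions(options)          // fills in args/args2/suffix/publish_* defaults
        """
        example_tool ${ioptions.args} ${fasta} > ${fasta}.out

        echo \$(example_tool --version 2>&1) > ${software}.version.txt
        """
    }

A caller then wires the config stanza through explicitly, e.g. EXAMPLE_TOOL(fasta, modules['example_tool']), which is why patches 174, 175 and 180 spend so much effort sorting and completing the args/args2/extra/suffix/publish_by_id/publish_dir/publish_results maps. The remaining hunk below applies the same treatment to the script block of BWA_INDEX.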
@@ -24,7 +25,7 @@ process BWA_INDEX { def software = getSoftwareName(task.process) def ioptions = initOptions(options) """ - bwa index $ioptions.args $fasta + bwa index ${ioptions.args} ${fasta} echo \$(bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//' > ${software}.version.txt """ } diff --git a/modules/nf-core/software/bwamem2_index.nf b/modules/nf-core/software/bwamem2_index.nf index 7e0f86f723..256f6284f7 100644 --- a/modules/nf-core/software/bwamem2_index.nf +++ b/modules/nf-core/software/bwamem2_index.nf @@ -1,8 +1,11 @@ +include { initOptions; saveFiles; getSoftwareName } from './functions' + process BWAMEM2_INDEX { tag "${fasta}" - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: {params.save_reference ? "reference_genome/BWAIndex/${it}" : null } + publishDir params.outdir, + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:'') } container "quay.io/biocontainers/bwa-mem2:2.0--he513fc3_1" @@ -16,7 +19,11 @@ process BWAMEM2_INDEX { path "${fasta}.*" script: + def software = getSoftwareName(task.process) + def ioptions = initOptions(options) """ - bwa-mem2 index ${fasta} + bwa-mem2 index ${ioptions.args} ${fasta} + + echo \$(bwa-mem2 version 2>&1) > ${software}.version.txt """ } From 04e0f96bf0ec1649a160b13152c862350489d734 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 7 Oct 2020 19:07:45 +0200 Subject: [PATCH 182/200] update module with new syntax --- conf/modules.config | 11 ++++++++++- .../nf-core/software/gatk/createsequencedictionary.nf | 9 ++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index c3a872b568..a2dba3723b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -69,13 +69,22 @@ params { publish_dir = "Reports/FastQC" publish_results = "all" } + 'gatk_createsequencedictionary' { + args = "" + args2 = "" + extra = "" + suffix = "" + publish_by_id = "false" + publish_dir = "dictionnary" + publish_results = "all" + } 'gatk_markduplicates' { args = "ASSUME_SORTED=true REMOVE_DUPLICATES=false VALIDATION_STRINGENCY=LENIENT TMP_DIR=tmp" args2 = "" extra = "" suffix = ".md" publish_by_id = "false" - publish_dir = "bwa/mergedLibrary" + publish_dir = "" publish_results = "all" } 'get_software_versions' { diff --git a/modules/nf-core/software/gatk/createsequencedictionary.nf b/modules/nf-core/software/gatk/createsequencedictionary.nf index ed902725b1..b2e19b1bc1 100644 --- a/modules/nf-core/software/gatk/createsequencedictionary.nf +++ b/modules/nf-core/software/gatk/createsequencedictionary.nf @@ -1,8 +1,10 @@ +include { initOptions; saveFiles; getSoftwareName } from './../functions' + process GATK_CREATESEQUENCEDICTIONARY { tag "${fasta}" publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: {params.save_reference ? 
"reference_genome/${it}" : null } + saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:'') } container "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" @@ -10,15 +12,20 @@ process GATK_CREATESEQUENCEDICTIONARY { input: path fasta + val options output: path "${fasta.baseName}.dict" script: + def software = getSoftwareName(task.process) + def ioptions = initOptions(options) """ gatk --java-options "-Xmx${task.memory.toGiga()}g" \ CreateSequenceDictionary \ --REFERENCE ${fasta} \ --OUTPUT ${fasta.baseName}.dict + + echo \$(gatk CreateSequenceDictionary --version 2>&1) | sed 's/^.*The Genome Analysis Toolkit (GATK) v//; s/ HTSJDK.*\$//' > ${software}.version.txt """ } \ No newline at end of file From 245d9311157c12051e16670e590d95ea029bd81b Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 7 Oct 2020 19:17:12 +0200 Subject: [PATCH 183/200] update module with new syntax --- conf/modules.config | 15 ++++++++++++--- modules/nf-core/software/samtools/faidx.nf | 9 ++++++++- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index a2dba3723b..93a369fc07 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -12,7 +12,7 @@ params { extra = "" suffix = "" publish_by_id = "false" - publish_dir = "genome/bwa_index" + publish_dir = "reference/bwa" publish_results = "all" } 'bwa_mem' { @@ -30,7 +30,7 @@ params { extra = "" suffix = "" publish_by_id = "false" - publish_dir = "genome/bwamem2_index" + publish_dir = "reference/bwamem2" publish_results = "all" } 'bwamem2_mem' { @@ -75,7 +75,7 @@ params { extra = "" suffix = "" publish_by_id = "false" - publish_dir = "dictionnary" + publish_dir = "reference" publish_results = "all" } 'gatk_markduplicates' { @@ -114,6 +114,15 @@ params { publish_dir = "" publish_results = "none" } + 'samtools_faidx' { + args = "" + args2 = "" + extra = "" + suffix = "" + publish_by_id = "false" + publish_dir = "reference" + publish_results = "none" + } 'samtools_index_mapping' { args = "" args2 = "" diff --git a/modules/nf-core/software/samtools/faidx.nf b/modules/nf-core/software/samtools/faidx.nf index 152d32cf5f..94fb04e0f6 100644 --- a/modules/nf-core/software/samtools/faidx.nf +++ b/modules/nf-core/software/samtools/faidx.nf @@ -1,8 +1,10 @@ +include { initOptions; saveFiles; getSoftwareName } from './../functions' + process SAMTOOLS_FAIDX { tag "${fasta}" publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: {params.save_reference ? 
"reference_genome/${it}" : null } + saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:'') } container "quay.io/biocontainers/samtools:1.10--h2e538c0_3" @@ -10,12 +12,17 @@ process SAMTOOLS_FAIDX { input: path fasta + val options output: path "${fasta}.fai" script: + def software = getSoftwareName(task.process) + def ioptions = initOptions(options) """ samtools faidx ${fasta} + + echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/ Using.*\$//' > ${software}.version.txt """ } From 1a7466f4587224f6ad2197d0254b088ae07ff98c Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 8 Oct 2020 13:36:16 +0200 Subject: [PATCH 184/200] use new module syntax --- conf/modules.config | 9 +++++++++ modules/local/subworkflow/build_indices.nf | 12 ++++++------ .../software/gatk/createsequencedictionary.nf | 2 +- modules/nf-core/software/htslib_tabix.nf | 10 ++++++++++ 4 files changed, 26 insertions(+), 7 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 93a369fc07..28cd1fb5f8 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -96,6 +96,15 @@ params { publish_dir = "pipeline_info" publish_results = "all" } + 'htslib_tabix' { + args = "" + args2 = "" + extra = "" + suffix = "" + publish_by_id = "false" + publish_dir = "reference" + publish_results = "all" + } 'merge_bam_mapping' { args = "" args2 = "" diff --git a/modules/local/subworkflow/build_indices.nf b/modules/local/subworkflow/build_indices.nf index 3237650683..ac48b9eb76 100644 --- a/modules/local/subworkflow/build_indices.nf +++ b/modules/local/subworkflow/build_indices.nf @@ -38,27 +38,27 @@ workflow BUILD_INDICES{ result_dict = Channel.empty() if (!(params.dict) && !('annotate' in step) && !('controlfreec' in step)) - result_dict = GATK_DICT(fasta) + result_dict = GATK_DICT(fasta, modules['gatk_createsequencedictionary']) result_fai = Channel.empty() if (!(params.fasta_fai) && !('annotate' in step)) - result_fai = SAMTOOLS_FAIDX(fasta) + result_fai = SAMTOOLS_FAIDX(fasta, modules['samtools_faidx']) result_dbsnp_tbi = Channel.empty() if (!(params.dbsnp_index) && params.dbsnp && ('mapping' in step || 'preparerecalibration' in step || 'controlfreec' in tools || 'haplotypecaller' in tools || 'mutect2' in tools || 'tnscope' in tools)) - result_dbsnp_tbi = TABIX_DBSNP(dbsnp) + result_dbsnp_tbi = TABIX_DBSNP(dbsnp, modules['htslib_tabix']) result_germline_resource_tbi = Channel.empty() if (!(params.germline_resource_index) && params.germline_resource && 'mutect2' in tools) - result_germline_resource_tbi = TABIX_GERMLINE_RESOURCE(germline_resource) + result_germline_resource_tbi = TABIX_GERMLINE_RESOURCE(germline_resource, modules['htslib_tabix']) result_known_indels_tbi = Channel.empty() if (!(params.known_indels_index) && params.known_indels && ('mapping' in step || 'preparerecalibration' in step)) - result_known_indels_tbi = TABIX_KNOWN_INDELS(known_indels) + result_known_indels_tbi = TABIX_KNOWN_INDELS(known_indels, modules['htslib_tabix']) result_pon_tbi = Channel.empty() if (!(params.pon_index) && params.pon && ('tnscope' in tools || 'mutect2' in tools)) - result_pon_tbi = TABIX_PON(pon) + result_pon_tbi = TABIX_PON(pon, modules['htslib_tabix']) if (params.no_intervals) { file("${params.outdir}/no_intervals.bed").text = "no_intervals\n" diff --git a/modules/nf-core/software/gatk/createsequencedictionary.nf b/modules/nf-core/software/gatk/createsequencedictionary.nf index b2e19b1bc1..e00292dc4d 100644 --- 
a/modules/nf-core/software/gatk/createsequencedictionary.nf +++ b/modules/nf-core/software/gatk/createsequencedictionary.nf @@ -26,6 +26,6 @@ process GATK_CREATESEQUENCEDICTIONARY { --REFERENCE ${fasta} \ --OUTPUT ${fasta.baseName}.dict - echo \$(gatk CreateSequenceDictionary --version 2>&1) | sed 's/^.*The Genome Analysis Toolkit (GATK) v//; s/ HTSJDK.*\$//' > ${software}.version.txt + echo \$(gatk CreateSequenceDictionary --version 2>&1) | sed 's/^.*(GATK) v//; s/ HTSJDK.*\$//' > ${software}.version.txt """ } \ No newline at end of file diff --git a/modules/nf-core/software/htslib_tabix.nf b/modules/nf-core/software/htslib_tabix.nf index 9dca431034..e8ff2a6cc5 100644 --- a/modules/nf-core/software/htslib_tabix.nf +++ b/modules/nf-core/software/htslib_tabix.nf @@ -1,18 +1,28 @@ +include { initOptions; saveFiles; getSoftwareName } from './functions' + process HTSLIB_TABIX { tag "${vcf}" + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:'') } + container 'quay.io/biocontainers/tabix:0.2.6--ha92aebf_0' conda (params.conda ? "bioconda::tabix=0.2.6" : null) input: path vcf + val options output: path "${vcf}.tbi" script: + def software = getSoftwareName(task.process) + def ioptions = initOptions(options) """ tabix -p vcf ${vcf} + + echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/(.*\$//' > ${software}.version.txt """ } From 436e1830a225dd941155b93ed1efc05d38947720 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 8 Oct 2020 13:37:44 +0200 Subject: [PATCH 185/200] update modules --- modules/local/process/build_intervals.nf | 2 ++ modules/local/process/bwa_mem.nf | 3 ++- modules/local/process/bwamem2_mem.nf | 2 ++ modules/local/process/concat_vcf.nf | 2 ++ modules/local/process/create_intervals_bed.nf | 1 + modules/local/process/merge_bam.nf | 2 ++ modules/nf-core/software/bwa/index/main.nf | 3 +-- modules/nf-core/software/bwamem2_index.nf | 3 +-- modules/nf-core/software/fastqc.nf | 2 ++ modules/nf-core/software/gatk/applybqsr.nf | 2 ++ modules/nf-core/software/gatk/baserecalibrator.nf | 2 ++ modules/nf-core/software/gatk/gatherbqsrreports.nf | 6 ++++-- modules/nf-core/software/gatk/genotypegvcf.nf | 2 ++ modules/nf-core/software/gatk/haplotypecaller.nf | 2 ++ modules/nf-core/software/gatk/markduplicates.nf | 2 ++ modules/nf-core/software/multiqc.nf | 2 ++ modules/nf-core/software/qualimap_bamqc.nf | 2 ++ modules/nf-core/software/samtools/index.nf | 2 ++ modules/nf-core/software/samtools/stats.nf | 2 ++ modules/nf-core/software/strelka/germline.nf | 1 - modules/nf-core/software/trimgalore.nf | 2 ++ 21 files changed, 39 insertions(+), 8 deletions(-) diff --git a/modules/local/process/build_intervals.nf b/modules/local/process/build_intervals.nf index 65d3cfb600..f915014c46 100644 --- a/modules/local/process/build_intervals.nf +++ b/modules/local/process/build_intervals.nf @@ -1,3 +1,5 @@ +include { initOptions; saveFiles; getSoftwareName } from './../../nf-core/software/functions' + process BUILD_INTERVALS { tag "${fai}" diff --git a/modules/local/process/bwa_mem.nf b/modules/local/process/bwa_mem.nf index d821d5cf0d..161003e37e 100644 --- a/modules/local/process/bwa_mem.nf +++ b/modules/local/process/bwa_mem.nf @@ -1,3 +1,5 @@ +include { initOptions; saveFiles; getSoftwareName } from './../../nf-core/software/functions' + process BWA_MEM { tag "${meta.id}" @@ -10,7 +12,6 @@ process BWA_MEM { else filename } container 
"quay.io/biocontainers/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:eabfac3657eda5818bae4090db989e3d41b01542-0" - //container "https://depot.galaxyproject.org/singularity/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:eabfac3657eda5818bae4090db989e3d41b01542-0" conda (params.conda ? "bioconda::bwa=0.7.17 bioconda::samtools=1.10" : null) diff --git a/modules/local/process/bwamem2_mem.nf b/modules/local/process/bwamem2_mem.nf index db67534832..d7b1acd17d 100644 --- a/modules/local/process/bwamem2_mem.nf +++ b/modules/local/process/bwamem2_mem.nf @@ -1,3 +1,5 @@ +include { initOptions; saveFiles; getSoftwareName } from './../../nf-core/software/functions' + process BWAMEM2_MEM { tag "${meta.id}" diff --git a/modules/local/process/concat_vcf.nf b/modules/local/process/concat_vcf.nf index d0b4e4b8cd..b43b74d8a4 100644 --- a/modules/local/process/concat_vcf.nf +++ b/modules/local/process/concat_vcf.nf @@ -1,3 +1,5 @@ +include { initOptions; saveFiles; getSoftwareName } from './../../nf-core/software/functions' + process CONCAT_VCF { label 'cpus_8' diff --git a/modules/local/process/create_intervals_bed.nf b/modules/local/process/create_intervals_bed.nf index 05fc396be4..08c5360b28 100644 --- a/modules/local/process/create_intervals_bed.nf +++ b/modules/local/process/create_intervals_bed.nf @@ -1,3 +1,4 @@ +include { initOptions; saveFiles; getSoftwareName } from './../../nf-core/software/functions' include { has_extension } from '../functions' process CREATE_INTERVALS_BED { diff --git a/modules/local/process/merge_bam.nf b/modules/local/process/merge_bam.nf index 3c952084e4..5155d2302c 100644 --- a/modules/local/process/merge_bam.nf +++ b/modules/local/process/merge_bam.nf @@ -1,3 +1,5 @@ +include { initOptions; saveFiles; getSoftwareName } from './../../nf-core/software/functions' + process MERGE_BAM { label 'cpus_8' diff --git a/modules/nf-core/software/bwa/index/main.nf b/modules/nf-core/software/bwa/index/main.nf index 9a7e5928c9..c4d0a10004 100644 --- a/modules/nf-core/software/bwa/index/main.nf +++ b/modules/nf-core/software/bwa/index/main.nf @@ -5,8 +5,7 @@ process BWA_INDEX { label 'process_high' - publishDir params.outdir, - mode: params.publish_dir_mode, + publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:'') } container "quay.io/biocontainers/bwa:0.7.17--hed695b0_7" diff --git a/modules/nf-core/software/bwamem2_index.nf b/modules/nf-core/software/bwamem2_index.nf index 256f6284f7..92c6b81b33 100644 --- a/modules/nf-core/software/bwamem2_index.nf +++ b/modules/nf-core/software/bwamem2_index.nf @@ -3,8 +3,7 @@ include { initOptions; saveFiles; getSoftwareName } from './functions' process BWAMEM2_INDEX { tag "${fasta}" - publishDir params.outdir, - mode: params.publish_dir_mode, + publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:'') } container "quay.io/biocontainers/bwa-mem2:2.0--he513fc3_1" diff --git a/modules/nf-core/software/fastqc.nf b/modules/nf-core/software/fastqc.nf index b9d1c091c3..0b17201ba3 100644 --- a/modules/nf-core/software/fastqc.nf +++ b/modules/nf-core/software/fastqc.nf @@ -1,3 +1,5 @@ +include { initOptions; saveFiles; getSoftwareName } from './functions' + process FASTQC { tag "${meta.id}" label 'process_medium' diff --git a/modules/nf-core/software/gatk/applybqsr.nf 
b/modules/nf-core/software/gatk/applybqsr.nf index 45e9927fac..1af93fd943 100644 --- a/modules/nf-core/software/gatk/applybqsr.nf +++ b/modules/nf-core/software/gatk/applybqsr.nf @@ -1,3 +1,5 @@ +include { initOptions; saveFiles; getSoftwareName } from './../functions' + process GATK_APPLYBQSR { label 'memory_singleCPU_2_task' label 'cpus_2' diff --git a/modules/nf-core/software/gatk/baserecalibrator.nf b/modules/nf-core/software/gatk/baserecalibrator.nf index 84fe403354..ebae85a38f 100644 --- a/modules/nf-core/software/gatk/baserecalibrator.nf +++ b/modules/nf-core/software/gatk/baserecalibrator.nf @@ -1,3 +1,5 @@ +include { initOptions; saveFiles; getSoftwareName } from './../functions' + process GATK_BASERECALIBRATOR { label 'cpus_1' diff --git a/modules/nf-core/software/gatk/gatherbqsrreports.nf b/modules/nf-core/software/gatk/gatherbqsrreports.nf index 7de5005abb..9be208457c 100644 --- a/modules/nf-core/software/gatk/gatherbqsrreports.nf +++ b/modules/nf-core/software/gatk/gatherbqsrreports.nf @@ -1,3 +1,5 @@ +include { initOptions; saveFiles; getSoftwareName } from './../functions' + process GATK_GATHERBQSRREPORTS { label 'memory_singleCPU_2_task' label 'cpus_2' @@ -5,8 +7,8 @@ process GATK_GATHERBQSRREPORTS { publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { - if (it == "${meta.sample}.recal.table" && !params.skip_markduplicates) "Preprocessing/${meta.sample}/DuplicatesMarked/${it}" - else "Preprocessing/${meta.sample}/Mapped/${it}" + if (it == "${meta.sample}.recal.table" && !params.skip_markduplicates) "preprocessing/${meta.sample}/markduplicates/${it}" + else "preprocessing/${meta.sample}/mapped/${it}" } container "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" diff --git a/modules/nf-core/software/gatk/genotypegvcf.nf b/modules/nf-core/software/gatk/genotypegvcf.nf index 9c7533746c..1967c27579 100644 --- a/modules/nf-core/software/gatk/genotypegvcf.nf +++ b/modules/nf-core/software/gatk/genotypegvcf.nf @@ -1,3 +1,5 @@ +include { initOptions; saveFiles; getSoftwareName } from './../functions' + process GATK_GENOTYPEGVCF { tag "${meta.id}-${interval.baseName}" diff --git a/modules/nf-core/software/gatk/haplotypecaller.nf b/modules/nf-core/software/gatk/haplotypecaller.nf index 7ee415a432..22f034994d 100644 --- a/modules/nf-core/software/gatk/haplotypecaller.nf +++ b/modules/nf-core/software/gatk/haplotypecaller.nf @@ -1,3 +1,5 @@ +include { initOptions; saveFiles; getSoftwareName } from './../functions' + process GATK_HAPLOTYPECALLER { label 'MEMORY_SINGLECPU_TASK_SQ' label 'CPUS_2' diff --git a/modules/nf-core/software/gatk/markduplicates.nf b/modules/nf-core/software/gatk/markduplicates.nf index 659750fa4c..168a524e3a 100644 --- a/modules/nf-core/software/gatk/markduplicates.nf +++ b/modules/nf-core/software/gatk/markduplicates.nf @@ -1,3 +1,5 @@ +include { initOptions; saveFiles; getSoftwareName } from './../functions' + process GATK_MARKDUPLICATES { label 'cpus_16' tag "${meta.id}" diff --git a/modules/nf-core/software/multiqc.nf b/modules/nf-core/software/multiqc.nf index 439a68c14e..78b0157091 100644 --- a/modules/nf-core/software/multiqc.nf +++ b/modules/nf-core/software/multiqc.nf @@ -1,3 +1,5 @@ +include { initOptions; saveFiles; getSoftwareName } from './functions' + // Has the run name been specified by the user? 
// this has the bonus effect of catching both -name and --name custom_runName = params.name diff --git a/modules/nf-core/software/qualimap_bamqc.nf b/modules/nf-core/software/qualimap_bamqc.nf index 76df692d2d..dc1d0a69c1 100644 --- a/modules/nf-core/software/qualimap_bamqc.nf +++ b/modules/nf-core/software/qualimap_bamqc.nf @@ -1,3 +1,5 @@ +include { initOptions; saveFiles; getSoftwareName } from './functions' + process QUALIMAP_BAMQC { label 'memory_max' label 'cpus_16' diff --git a/modules/nf-core/software/samtools/index.nf b/modules/nf-core/software/samtools/index.nf index 9dcbe5c524..5b7f8f1a32 100644 --- a/modules/nf-core/software/samtools/index.nf +++ b/modules/nf-core/software/samtools/index.nf @@ -1,3 +1,5 @@ +include { initOptions; saveFiles; getSoftwareName } from './../functions' + process SAMTOOLS_INDEX { label 'cpus_8' diff --git a/modules/nf-core/software/samtools/stats.nf b/modules/nf-core/software/samtools/stats.nf index 6dfb321efa..49cced37cf 100644 --- a/modules/nf-core/software/samtools/stats.nf +++ b/modules/nf-core/software/samtools/stats.nf @@ -1,3 +1,5 @@ +include { initOptions; saveFiles; getSoftwareName } from './../functions' + process SAMTOOLS_STATS { label 'cpus_2' diff --git a/modules/nf-core/software/strelka/germline.nf b/modules/nf-core/software/strelka/germline.nf index 8640cf57f7..c93bbe5e87 100644 --- a/modules/nf-core/software/strelka/germline.nf +++ b/modules/nf-core/software/strelka/germline.nf @@ -1,4 +1,3 @@ -// Import generic module functions include { initOptions; saveFiles; getSoftwareName } from './../functions' process STRELKA_GERMLINE { diff --git a/modules/nf-core/software/trimgalore.nf b/modules/nf-core/software/trimgalore.nf index 440d00d808..23b2553651 100644 --- a/modules/nf-core/software/trimgalore.nf +++ b/modules/nf-core/software/trimgalore.nf @@ -1,3 +1,5 @@ +include { initOptions; saveFiles; getSoftwareName } from './functions' + process TRIMGALORE { tag "${meta.id}" label 'process_high' From a0d3e469512a380c1577763af3ada77ded98a1cf Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 8 Oct 2020 13:55:36 +0200 Subject: [PATCH 186/200] remove downloading of docker images --- .github/workflows/ci.yml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 54a8b97a8a..640128c317 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -62,8 +62,6 @@ jobs: env: # Only check Nextflow pipeline minimum version NXF_VER: '20.07.1' - - name: Pull docker image - run: docker pull nfcore/sarek:dsl2 - name: Run ${{ matrix.profile }} test run: nextflow run ${GITHUB_WORKSPACE} -profile test,docker --aligner ${{ matrix.aligner }} @@ -83,8 +81,6 @@ jobs: env: # Only check Nextflow pipeline minimum version NXF_VER: '20.07.1' - - name: Pull docker image - run: docker pull nfcore/sarek:dsl2 - name: Get test data run: git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data - name: Run germline test --step mapping @@ -115,7 +111,6 @@ jobs: # NXF_VER: '20.07.1' # - name: Pull docker image # run: | - # docker pull nfcore/sarek:dsl2 # docker pull nfcore/sarek${{ matrix.tools }}:dev.${{ matrix.species }} # - name: Run annotation test # run: nextflow run ${GITHUB_WORKSPACE} -profile test_annotation,docker --tools ${{ matrix.tools }} @@ -137,8 +132,6 @@ jobs: # env: # # Only check Nextflow pipeline minimum version # NXF_VER: '20.07.1' - # - name: Pull docker image - # run: docker pull nfcore/sarek:dsl2 # - name: Run test for minimal genomes # run: 
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --skipQC all --genome ${{ matrix.genome }} ${{ matrix.intervals }} --tools Manta,mpileup,Strelka,FreeBayes @@ -158,8 +151,6 @@ jobs: # env: # # Only check Nextflow pipeline minimum version # NXF_VER: '20.07.1' - # - name: Pull docker image - # run: docker pull nfcore/sarek:dsl2 # - name: Run ${{ matrix.profile }} test # run: nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.profile }},docker @@ -190,7 +181,5 @@ jobs: env: # Only check Nextflow pipeline minimum version NXF_VER: '20.07.1' - - name: Pull docker image - run: docker pull nfcore/sarek:dsl2 - name: Run ${{ matrix.tool }} test run: nextflow run ${GITHUB_WORKSPACE} -profile test_tool,docker --tools ${{ matrix.tool }} ${{ matrix.intervals }} From 6df3ac1b330234e0199f7755466f0d3a5e015a0b Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 8 Oct 2020 13:59:44 +0200 Subject: [PATCH 187/200] better tests --- .github/workflows/ci.yml | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 640128c317..4e3385947f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,20 +24,6 @@ jobs: steps: - name: Check out pipeline code uses: actions/checkout@v2 - - name: Check if Dockerfile or Conda environment changed - uses: technote-space/get-diff-action@v1 - with: - PREFIX_FILTER: | - Dockerfile - environment.yml - - name: Build new docker image - if: env.GIT_DIFF - run: docker build --no-cache . -t nfcore/sarek:dsl2 - - name: Pull docker image - if: ${{ !env.GIT_DIFF }} - run: | - docker pull nfcore/sarek:dsl2 - docker tag nfcore/sarek:dsl2 nfcore/sarek:dsl2 - name: Install Nextflow run: | wget -qO- get.nextflow.io | bash @@ -47,6 +33,7 @@ jobs: nextflow run ${GITHUB_WORKSPACE} -profile test,docker aligner: + name: Run aligner tests env: NXF_ANSI_LOG: false runs-on: ubuntu-latest @@ -66,6 +53,7 @@ jobs: run: nextflow run ${GITHUB_WORKSPACE} -profile test,docker --aligner ${{ matrix.aligner }} germline: + name: Run input from a folder test and restart from step tests env: NXF_ANSI_LOG: false runs-on: ubuntu-latest @@ -155,6 +143,7 @@ jobs: # run: nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.profile }},docker tools: + name: Run tool tests env: NXF_ANSI_LOG: false runs-on: ubuntu-latest From 7e8ebf33c26531ef598aec6d9567ad3d721f259d Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 8 Oct 2020 15:08:51 +0200 Subject: [PATCH 188/200] trying to fix issue with docker on a specific container --- conf/base.config | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/conf/base.config b/conf/base.config index 94ca2c2a12..f656b44efb 100644 --- a/conf/base.config +++ b/conf/base.config @@ -52,6 +52,10 @@ process { cache = false } + withName:CREATE_INTERVALS_BED { + docker.runOptions = '-u \$(id -u):\$(id -g)' + } + withName:CONCAT_VCF { // For unknown reasons, CONCATVCF sometimes fails with SIGPIPE // (exit code 141). Rerunning the process will usually work. 
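The change above scopes extra Docker flags to a single process through a `withName` selector, running the task as the invoking user so that files written into the work directory are not owned by root on the host. The patch expresses this by setting `docker.runOptions` inside the selector; a minimal sketch of the same effect using Nextflow's per-process `containerOptions` directive is shown below as an alternative, not as what the patch does (the selector name matches the process in this pipeline; the config placement is illustrative):

    // illustrative nextflow.config fragment, not part of the patch series
    process {
        withName:CREATE_INTERVALS_BED {
            // $(id -u):$(id -g) is expanded by the shell that launches the
            // container, so the task runs with the host user's UID/GID
            // instead of root
            containerOptions = '-u $(id -u):$(id -g)'
        }
    }

The next patch removes this workaround again in favour of a different container image, suggesting the permission fix was only needed for the original biocontainers image.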
From 2cc09a5bd11e27733cf22d80468b3581ccfbd8c7 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 8 Oct 2020 15:23:43 +0200 Subject: [PATCH 189/200] change container --- conf/base.config | 4 ---- modules/local/process/build_intervals.nf | 4 ++-- modules/local/process/create_intervals_bed.nf | 4 ++-- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/conf/base.config b/conf/base.config index f656b44efb..94ca2c2a12 100644 --- a/conf/base.config +++ b/conf/base.config @@ -52,10 +52,6 @@ process { cache = false } - withName:CREATE_INTERVALS_BED { - docker.runOptions = '-u \$(id -u):\$(id -g)' - } - withName:CONCAT_VCF { // For unknown reasons, CONCATVCF sometimes fails with SIGPIPE // (exit code 141). Rerunning the process will usually work. diff --git a/modules/local/process/build_intervals.nf b/modules/local/process/build_intervals.nf index f915014c46..c718ae32b5 100644 --- a/modules/local/process/build_intervals.nf +++ b/modules/local/process/build_intervals.nf @@ -6,9 +6,9 @@ process BUILD_INTERVALS { publishDir params.outdir, mode: params.publish_dir_mode, saveAs: {params.save_reference ? "reference_genome/${it}" : null } - container "biocontainers/biocontainers:v1.2.0_cv1" + container "quay.io/biocontainers/gawk:5.1.0" - conda (params.conda ? "conda-forge::sed=4.7" : null) + conda (params.conda ? "anaconda::gawk=5.1.0" : null) input: path fai diff --git a/modules/local/process/create_intervals_bed.nf b/modules/local/process/create_intervals_bed.nf index 08c5360b28..2193ae0929 100644 --- a/modules/local/process/create_intervals_bed.nf +++ b/modules/local/process/create_intervals_bed.nf @@ -4,9 +4,9 @@ include { has_extension } from '../functions' process CREATE_INTERVALS_BED { tag "${intervals}" - container "biocontainers/biocontainers:v1.2.0_cv1" + container "quay.io/biocontainers/gawk:5.1.0" - conda (params.conda ? "conda-forge::sed=4.7" : null) + conda (params.conda ? 
"anaconda::gawk=5.1.0" : null) input: path intervals From fe244f19d6cf9eff9fcc68d378da11798e626128 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 13 Oct 2020 18:01:45 +0200 Subject: [PATCH 190/200] use new syntax to define docker/singularity container and/or conda env --- main.nf | 6 +-- modules/local/process/build_intervals.nf | 11 +++-- modules/local/process/bwa_mem.nf | 9 ++-- modules/local/process/bwamem2_mem.nf | 9 ++-- modules/local/process/concat_vcf.nf | 9 ++-- modules/local/process/create_intervals_bed.nf | 9 ++-- .../local/process/get_software_versions.nf | 42 ------------------- modules/local/process/merge_bam.nf | 9 ++-- modules/nf-core/software/bwa/index/main.nf | 9 ++-- modules/nf-core/software/bwamem2_index.nf | 9 ++-- modules/nf-core/software/fastqc.nf | 9 ++-- modules/nf-core/software/gatk/applybqsr.nf | 9 ++-- .../nf-core/software/gatk/baserecalibrator.nf | 9 ++-- .../software/gatk/createsequencedictionary.nf | 9 ++-- .../software/gatk/gatherbqsrreports.nf | 9 ++-- modules/nf-core/software/gatk/genotypegvcf.nf | 9 ++-- .../nf-core/software/gatk/haplotypecaller.nf | 9 ++-- .../nf-core/software/gatk/markduplicates.nf | 9 ++-- modules/nf-core/software/htslib_tabix.nf | 9 ++-- modules/nf-core/software/multiqc.nf | 13 +++--- modules/nf-core/software/qualimap_bamqc.nf | 9 ++-- modules/nf-core/software/samtools/faidx.nf | 9 ++-- modules/nf-core/software/samtools/index.nf | 9 ++-- modules/nf-core/software/samtools/stats.nf | 9 ++-- modules/nf-core/software/strelka/germline.nf | 9 ++-- modules/nf-core/software/trimgalore.nf | 9 ++-- 26 files changed, 149 insertions(+), 121 deletions(-) delete mode 100644 modules/local/process/get_software_versions.nf diff --git a/main.nf b/main.nf index 6b6e6525a2..7abbb6f476 100644 --- a/main.nf +++ b/main.nf @@ -256,8 +256,6 @@ if (params.sentieon) log.warn "[nf-core/sarek] Sentieon will be used, only works ================================================================================ */ -include { GET_SOFTWARE_VERSIONS } from './modules/local/process/get_software_versions' - /* ================================================================================ INCLUDE LOCAL SUBWORKFLOWS @@ -505,10 +503,10 @@ workflow { ================================================================================ */ - GET_SOFTWARE_VERSIONS() + // GET_SOFTWARE_VERSIONS() MULTIQC( - GET_SOFTWARE_VERSIONS.out.yml, + // GET_SOFTWARE_VERSIONS.out.yml, multiqc_config, multiqc_custom_config.ifEmpty([]), workflow_summary, diff --git a/modules/local/process/build_intervals.nf b/modules/local/process/build_intervals.nf index c718ae32b5..fd0c7df1a6 100644 --- a/modules/local/process/build_intervals.nf +++ b/modules/local/process/build_intervals.nf @@ -1,14 +1,17 @@ include { initOptions; saveFiles; getSoftwareName } from './../../nf-core/software/functions' +environment = params.conda ? "anaconda::gawk=5.1.0" : null +container = "quay.io/biocontainers/gawk:5.1.0" +if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/gawk:5.1.0" + process BUILD_INTERVALS { - tag "${fai}" + tag fai publishDir params.outdir, mode: params.publish_dir_mode, saveAs: {params.save_reference ? "reference_genome/${it}" : null } - container "quay.io/biocontainers/gawk:5.1.0" - - conda (params.conda ? 
"anaconda::gawk=5.1.0" : null) + conda environment + container container input: path fai diff --git a/modules/local/process/bwa_mem.nf b/modules/local/process/bwa_mem.nf index 161003e37e..2206173399 100644 --- a/modules/local/process/bwa_mem.nf +++ b/modules/local/process/bwa_mem.nf @@ -1,5 +1,9 @@ include { initOptions; saveFiles; getSoftwareName } from './../../nf-core/software/functions' +environment = params.conda ? "bioconda::bwa=0.7.17 bioconda::samtools=1.10" : null +container = "quay.io/biocontainers/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:eabfac3657eda5818bae4090db989e3d41b01542-0" +if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:eabfac3657eda5818bae4090db989e3d41b01542-0" + process BWA_MEM { tag "${meta.id}" @@ -11,9 +15,8 @@ process BWA_MEM { if (filename.endsWith('.version.txt')) null else filename } - container "quay.io/biocontainers/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:eabfac3657eda5818bae4090db989e3d41b01542-0" - - conda (params.conda ? "bioconda::bwa=0.7.17 bioconda::samtools=1.10" : null) + conda environment + container container input: tuple val(meta), path(reads) diff --git a/modules/local/process/bwamem2_mem.nf b/modules/local/process/bwamem2_mem.nf index d7b1acd17d..d6f01ff770 100644 --- a/modules/local/process/bwamem2_mem.nf +++ b/modules/local/process/bwamem2_mem.nf @@ -1,5 +1,9 @@ include { initOptions; saveFiles; getSoftwareName } from './../../nf-core/software/functions' +environment = params.conda ? "bioconda::bwa-mem2=2.0 bioconda::samtools=1.10" : null +container = "quay.io/biocontainers/mulled-v2-e5d375990341c5aef3c9aff74f96f66f65375ef6:876eb6f1d38fbf578296ea94e5aede4e317939e7-0" +if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/mulled-v2-e5d375990341c5aef3c9aff74f96f66f65375ef6:876eb6f1d38fbf578296ea94e5aede4e317939e7-0" + process BWAMEM2_MEM { tag "${meta.id}" @@ -11,9 +15,8 @@ process BWAMEM2_MEM { if (filename.endsWith('.version.txt')) null else filename } - container "quay.io/biocontainers/mulled-v2-e5d375990341c5aef3c9aff74f96f66f65375ef6:876eb6f1d38fbf578296ea94e5aede4e317939e7-0" - - conda (params.conda ? "bioconda::bwa-mem2=2.0 bioconda::samtools=1.10" : null) + conda environment + container container input: tuple val(meta), path(reads) diff --git a/modules/local/process/concat_vcf.nf b/modules/local/process/concat_vcf.nf index b43b74d8a4..b44abcd5c9 100644 --- a/modules/local/process/concat_vcf.nf +++ b/modules/local/process/concat_vcf.nf @@ -1,5 +1,9 @@ include { initOptions; saveFiles; getSoftwareName } from './../../nf-core/software/functions' +environment = params.conda ? "bioconda::htslib=1.11" : null +container = "quay.io/biocontainers/htslib:1.11--hd3b49d5_0" +if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/htslib:1.11--hd3b49d5_0" + process CONCAT_VCF { label 'cpus_8' @@ -7,9 +11,8 @@ process CONCAT_VCF { publishDir "${params.outdir}/VariantCalling/${meta.id}/${options.publish_dir}", mode: params.publish_dir_mode - container "quay.io/biocontainers/htslib:1.11--hd3b49d5_0" - - conda (params.conda ? 
"bioconda::htslib=1.11" : null) + conda environment + container container input: tuple val(meta), path(vcf) diff --git a/modules/local/process/create_intervals_bed.nf b/modules/local/process/create_intervals_bed.nf index 2193ae0929..6d4470612e 100644 --- a/modules/local/process/create_intervals_bed.nf +++ b/modules/local/process/create_intervals_bed.nf @@ -1,12 +1,15 @@ include { initOptions; saveFiles; getSoftwareName } from './../../nf-core/software/functions' include { has_extension } from '../functions' +environment = params.conda ? "anaconda::gawk=5.1.0" : null +container = "quay.io/biocontainers/gawk:5.1.0" +if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/gawk:5.1.0" + process CREATE_INTERVALS_BED { tag "${intervals}" - container "quay.io/biocontainers/gawk:5.1.0" - - conda (params.conda ? "anaconda::gawk=5.1.0" : null) + conda environment + container container input: path intervals diff --git a/modules/local/process/get_software_versions.nf b/modules/local/process/get_software_versions.nf deleted file mode 100644 index 124d0d9884..0000000000 --- a/modules/local/process/get_software_versions.nf +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Parse software version numbers - */ -process GET_SOFTWARE_VERSIONS { - publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode, - saveAs: { filename -> - if (filename.indexOf(".csv") > 0) filename - else null - } - - output: - path "software_versions_mqc.yaml", emit: yml - path "software_versions.csv", emit: csv - - script: - """ - // alleleCounter --version &> v_allelecount.txt 2>&1 || true - // bcftools --version &> v_bcftools.txt 2>&1 || true - // bwa-mem2 version &> v_bwamem2.txt 2>&1 || true - // cnvkit.py version &> v_cnvkit.txt 2>&1 || true - // configManta.py --version &> v_manta.txt 2>&1 || true - // configureStrelkaGermlineWorkflow.py --version &> v_strelka.txt 2>&1 || true - // echo "${workflow.manifest.version}" &> v_pipeline.txt 2>&1 || true - // echo "${workflow.nextflow.version}" &> v_nextflow.txt 2>&1 || true - snpEff -version &> v_snpeff.txt 2>&1 || true - // fastqc --version &> v_fastqc.txt 2>&1 || true - // freebayes --version &> v_freebayes.txt 2>&1 || true - // freec &> v_controlfreec.txt 2>&1 || true - // gatk ApplyBQSR --help &> v_gatk.txt 2>&1 || true - // msisensor &> v_msisensor.txt 2>&1 || true - // multiqc --version &> v_multiqc.txt 2>&1 || true - // qualimap --version &> v_qualimap.txt 2>&1 || true - // R --version &> v_r.txt 2>&1 || true - // R -e "library(ASCAT); help(package='ASCAT')" &> v_ascat.txt 2>&1 || true - samtools --version &> v_samtools.txt 2>&1 || true - // tiddit &> v_tiddit.txt 2>&1 || true - // trim_galore -v &> v_trim_galore.txt 2>&1 || true - // vcftools --version &> v_vcftools.txt 2>&1 || true - // vep --help &> v_vep.txt 2>&1 || true - scrape_software_versions.py &> software_versions_mqc.yaml - """ -} diff --git a/modules/local/process/merge_bam.nf b/modules/local/process/merge_bam.nf index 5155d2302c..58069773c8 100644 --- a/modules/local/process/merge_bam.nf +++ b/modules/local/process/merge_bam.nf @@ -1,13 +1,16 @@ include { initOptions; saveFiles; getSoftwareName } from './../../nf-core/software/functions' +environment = params.conda ? 
"bioconda::samtools=1.10" : null +container = "quay.io/biocontainers/samtools:1.10--h2e538c0_3" +if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/samtools:1.10--h2e538c0_3" + process MERGE_BAM { label 'cpus_8' tag "${meta.id}" - container "quay.io/biocontainers/samtools:1.10--h2e538c0_3" - - conda (params.conda ? "bioconda::samtools=1.10" : null) + conda environment + container container input: tuple val(meta), path(bam) diff --git a/modules/nf-core/software/bwa/index/main.nf b/modules/nf-core/software/bwa/index/main.nf index c4d0a10004..869ee420c0 100644 --- a/modules/nf-core/software/bwa/index/main.nf +++ b/modules/nf-core/software/bwa/index/main.nf @@ -1,5 +1,9 @@ include { initOptions; saveFiles; getSoftwareName } from './../../functions' +environment = params.conda ? "bioconda::bwa=0.7.17" : null +container = "quay.io/biocontainers/bwa:0.7.17--hed695b0_7" +if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/bwa:0.7.17--hed695b0_7" + process BWA_INDEX { tag "${fasta}" @@ -8,9 +12,8 @@ process BWA_INDEX { publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:'') } - container "quay.io/biocontainers/bwa:0.7.17--hed695b0_7" - - conda (params.conda ? "bioconda::bwa=0.7.17" : null) + conda environment + container container input: path fasta diff --git a/modules/nf-core/software/bwamem2_index.nf b/modules/nf-core/software/bwamem2_index.nf index 92c6b81b33..4e49ff773a 100644 --- a/modules/nf-core/software/bwamem2_index.nf +++ b/modules/nf-core/software/bwamem2_index.nf @@ -1,14 +1,17 @@ include { initOptions; saveFiles; getSoftwareName } from './functions' +environment = params.conda ? "bioconda::bwa-mem2=2.0" : null +container = "quay.io/biocontainers/bwa-mem2:2.0--he513fc3_1" +if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/bwa-mem2:2.0--he513fc3_1" + process BWAMEM2_INDEX { tag "${fasta}" publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:'') } - container "quay.io/biocontainers/bwa-mem2:2.0--he513fc3_1" - - conda (params.conda ? "bioconda::bwa-mem2=2.0" : null) + conda environment + container container input: path fasta diff --git a/modules/nf-core/software/fastqc.nf b/modules/nf-core/software/fastqc.nf index 0b17201ba3..3205de93db 100644 --- a/modules/nf-core/software/fastqc.nf +++ b/modules/nf-core/software/fastqc.nf @@ -1,5 +1,9 @@ include { initOptions; saveFiles; getSoftwareName } from './functions' +environment = params.conda ? "bioconda::fastqc=0.11.9" : null +container = "quay.io/biocontainers/fastqc:0.11.9--0" +if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/fastqc:0.11.9--0" + process FASTQC { tag "${meta.id}" label 'process_medium' @@ -12,9 +16,8 @@ process FASTQC { else if (filename.endsWith('.version.txt')) null else filename } - container "quay.io/biocontainers/fastqc:0.11.9--0" - - conda (params.conda ? 
"bioconda::fastqc=0.11.9" : null) + conda environment + container container input: tuple val(meta), path(reads) diff --git a/modules/nf-core/software/gatk/applybqsr.nf b/modules/nf-core/software/gatk/applybqsr.nf index 1af93fd943..df12da4336 100644 --- a/modules/nf-core/software/gatk/applybqsr.nf +++ b/modules/nf-core/software/gatk/applybqsr.nf @@ -1,14 +1,17 @@ include { initOptions; saveFiles; getSoftwareName } from './../functions' +environment = params.conda ? "bioconda::gatk4-spark=4.1.8.1" : null +container = "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" +if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0" + process GATK_APPLYBQSR { label 'memory_singleCPU_2_task' label 'cpus_2' tag "${meta.id}-${interval.baseName}" - container "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" - - conda (params.conda ? "bioconda::gatk4-spark=4.1.8.1" : null) + conda environment + container container input: tuple val(meta), path(bam), path(bai), path(recalibrationReport), path(interval) diff --git a/modules/nf-core/software/gatk/baserecalibrator.nf b/modules/nf-core/software/gatk/baserecalibrator.nf index ebae85a38f..fb780f095b 100644 --- a/modules/nf-core/software/gatk/baserecalibrator.nf +++ b/modules/nf-core/software/gatk/baserecalibrator.nf @@ -1,13 +1,16 @@ include { initOptions; saveFiles; getSoftwareName } from './../functions' +environment = params.conda ? "bioconda::gatk4-spark=4.1.8.1" : null +container = "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" +if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0" + process GATK_BASERECALIBRATOR { label 'cpus_1' tag "${meta.id}-${interval.baseName}" - container "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" - - conda (params.conda ? "bioconda::gatk4-spark=4.1.8.1" : null) + conda environment + container container input: tuple val(meta), path(bam), path(bai), path(interval) diff --git a/modules/nf-core/software/gatk/createsequencedictionary.nf b/modules/nf-core/software/gatk/createsequencedictionary.nf index e00292dc4d..b22f2c6e6e 100644 --- a/modules/nf-core/software/gatk/createsequencedictionary.nf +++ b/modules/nf-core/software/gatk/createsequencedictionary.nf @@ -1,14 +1,17 @@ include { initOptions; saveFiles; getSoftwareName } from './../functions' +environment = params.conda ? "bioconda::gatk4-spark=4.1.8.1" : null +container = "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" +if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0" + process GATK_CREATESEQUENCEDICTIONARY { tag "${fasta}" publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:'') } - container "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" - - conda (params.conda ? "bioconda::gatk4-spark=4.1.8.1" : null) + conda environment + container container input: path fasta diff --git a/modules/nf-core/software/gatk/gatherbqsrreports.nf b/modules/nf-core/software/gatk/gatherbqsrreports.nf index 9be208457c..65c07fb5e2 100644 --- a/modules/nf-core/software/gatk/gatherbqsrreports.nf +++ b/modules/nf-core/software/gatk/gatherbqsrreports.nf @@ -1,5 +1,9 @@ include { initOptions; saveFiles; getSoftwareName } from './../functions' +environment = params.conda ? 
"bioconda::gatk4-spark=4.1.8.1" : null +container = "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" +if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0" + process GATK_GATHERBQSRREPORTS { label 'memory_singleCPU_2_task' label 'cpus_2' @@ -11,9 +15,8 @@ process GATK_GATHERBQSRREPORTS { else "preprocessing/${meta.sample}/mapped/${it}" } - container "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" - - conda (params.conda ? "bioconda::gatk4-spark=4.1.8.1" : null) + conda environment + container container input: tuple val(meta), path(recal) diff --git a/modules/nf-core/software/gatk/genotypegvcf.nf b/modules/nf-core/software/gatk/genotypegvcf.nf index 1967c27579..cc418cfbf4 100644 --- a/modules/nf-core/software/gatk/genotypegvcf.nf +++ b/modules/nf-core/software/gatk/genotypegvcf.nf @@ -1,11 +1,14 @@ include { initOptions; saveFiles; getSoftwareName } from './../functions' +environment = params.conda ? "bioconda::gatk4-spark=4.1.8.1" : null +container = "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" +if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0" + process GATK_GENOTYPEGVCF { tag "${meta.id}-${interval.baseName}" - container "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" - - conda (params.conda ? "bioconda::gatk4-spark=4.1.8.1" : null) + conda environment + container container input: tuple val(meta), path(interval), path(gvcf) diff --git a/modules/nf-core/software/gatk/haplotypecaller.nf b/modules/nf-core/software/gatk/haplotypecaller.nf index 22f034994d..ee0186ba13 100644 --- a/modules/nf-core/software/gatk/haplotypecaller.nf +++ b/modules/nf-core/software/gatk/haplotypecaller.nf @@ -1,14 +1,17 @@ include { initOptions; saveFiles; getSoftwareName } from './../functions' +environment = params.conda ? "bioconda::gatk4-spark=4.1.8.1" : null +container = "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" +if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0" + process GATK_HAPLOTYPECALLER { label 'MEMORY_SINGLECPU_TASK_SQ' label 'CPUS_2' tag "${meta.id}-${interval.baseName}" - container "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" - - conda (params.conda ? "bioconda::gatk4-spark=4.1.8.1" : null) + conda environment + container container input: tuple val(meta), path(bam), path(bai), file(interval) diff --git a/modules/nf-core/software/gatk/markduplicates.nf b/modules/nf-core/software/gatk/markduplicates.nf index 168a524e3a..9f457a534a 100644 --- a/modules/nf-core/software/gatk/markduplicates.nf +++ b/modules/nf-core/software/gatk/markduplicates.nf @@ -1,5 +1,9 @@ include { initOptions; saveFiles; getSoftwareName } from './../functions' +environment = params.conda ? "bioconda::gatk4-spark=4.1.8.1" : null +container = "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" +if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0" + process GATK_MARKDUPLICATES { label 'cpus_16' tag "${meta.id}" @@ -10,9 +14,8 @@ process GATK_MARKDUPLICATES { else "preprocessing/${meta.sample}/markduplicates/${it}" } - container "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" - - conda (params.conda ? 
"bioconda::gatk4-spark=4.1.8.1" : null) + conda environment + container container input: tuple val(meta), path("${meta.sample}.bam"), path("${meta.sample}.bam.bai") diff --git a/modules/nf-core/software/htslib_tabix.nf b/modules/nf-core/software/htslib_tabix.nf index e8ff2a6cc5..ceaeba3f5d 100644 --- a/modules/nf-core/software/htslib_tabix.nf +++ b/modules/nf-core/software/htslib_tabix.nf @@ -1,14 +1,17 @@ include { initOptions; saveFiles; getSoftwareName } from './functions' +environment = params.conda ? "bioconda::tabix=0.2.6" : null +container = "quay.io/biocontainers/tabix:0.2.6--ha92aebf_0" +if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/tabix:0.2.6--ha92aebf_0" + process HTSLIB_TABIX { tag "${vcf}" publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:'') } - container 'quay.io/biocontainers/tabix:0.2.6--ha92aebf_0' - - conda (params.conda ? "bioconda::tabix=0.2.6" : null) + conda environment + container container input: path vcf diff --git a/modules/nf-core/software/multiqc.nf b/modules/nf-core/software/multiqc.nf index 78b0157091..33c931ad99 100644 --- a/modules/nf-core/software/multiqc.nf +++ b/modules/nf-core/software/multiqc.nf @@ -1,8 +1,12 @@ include { initOptions; saveFiles; getSoftwareName } from './functions' +environment = params.conda ? "bioconda::qualimap=2.2.2d" : null +container = "quay.io/biocontainers/multiqc=1.9" +if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/multiqc:1.9--py_1" + // Has the run name been specified by the user? // this has the bonus effect of catching both -name and --name -custom_runName = params.name +def custom_runName = params.name if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) { custom_runName = workflow.runName } @@ -10,12 +14,11 @@ if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) { process MULTIQC { publishDir "${params.outdir}/multiqc", mode: params.publish_dir_mode - container "quay.io/biocontainers/multiqc:1.9--py_1" - - conda (params.conda ? "bioconda::multiqc=1.9" : null) + conda environment + container container input: - path software_versions + // path software_versions path multiqc_config path multiqc_custom_config val workflow_summary diff --git a/modules/nf-core/software/qualimap_bamqc.nf b/modules/nf-core/software/qualimap_bamqc.nf index dc1d0a69c1..515dc22cf6 100644 --- a/modules/nf-core/software/qualimap_bamqc.nf +++ b/modules/nf-core/software/qualimap_bamqc.nf @@ -1,5 +1,9 @@ include { initOptions; saveFiles; getSoftwareName } from './functions' +environment = params.conda ? "bioconda::qualimap=2.2.2d" : null +container = "quay.io/biocontainers/qualimap:2.2.2d--1" +if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/qualimap:2.2.2d--1" + process QUALIMAP_BAMQC { label 'memory_max' label 'cpus_16' @@ -8,9 +12,8 @@ process QUALIMAP_BAMQC { publishDir "${params.outdir}/Reports/${meta.id}/bamQC", mode: params.publish_dir_mode - container "quay.io/biocontainers/qualimap:2.2.2d--1" - - conda (params.conda ? 
"bioconda::qualimap=2.2.2d" : null) + conda environment + container container input: tuple val(meta), path(bam) diff --git a/modules/nf-core/software/samtools/faidx.nf b/modules/nf-core/software/samtools/faidx.nf index 94fb04e0f6..893c4ec5a9 100644 --- a/modules/nf-core/software/samtools/faidx.nf +++ b/modules/nf-core/software/samtools/faidx.nf @@ -1,14 +1,17 @@ include { initOptions; saveFiles; getSoftwareName } from './../functions' +environment = params.conda ? "bioconda::samtools=1.10" : null +container = "quay.io/biocontainers/samtools:1.10--h2e538c0_3" +if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/samtools:1.10--h2e538c0_3" + process SAMTOOLS_FAIDX { tag "${fasta}" publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:'') } - container "quay.io/biocontainers/samtools:1.10--h2e538c0_3" - - conda (params.conda ? "bioconda::samtools=1.10" : null) + conda environment + container container input: path fasta diff --git a/modules/nf-core/software/samtools/index.nf b/modules/nf-core/software/samtools/index.nf index 5b7f8f1a32..4b042bb1dd 100644 --- a/modules/nf-core/software/samtools/index.nf +++ b/modules/nf-core/software/samtools/index.nf @@ -1,5 +1,9 @@ include { initOptions; saveFiles; getSoftwareName } from './../functions' +environment = params.conda ? "bioconda::samtools=1.10" : null +container = "quay.io/biocontainers/samtools:1.10--h2e538c0_3" +if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/samtools:1.10--h2e538c0_3" + process SAMTOOLS_INDEX { label 'cpus_8' @@ -11,9 +15,8 @@ process SAMTOOLS_INDEX { else if (filename.endsWith('.version.txt')) null else "${options.publish_dir_up}/${meta.sample}/${options.publish_dir_down}/${filename}" } - container "quay.io/biocontainers/samtools:1.10--h2e538c0_3" - - conda (params.conda ? "bioconda::samtools=1.10" : null) + conda environment + container container input: tuple val(meta), path(bam) diff --git a/modules/nf-core/software/samtools/stats.nf b/modules/nf-core/software/samtools/stats.nf index 49cced37cf..6138d07a05 100644 --- a/modules/nf-core/software/samtools/stats.nf +++ b/modules/nf-core/software/samtools/stats.nf @@ -1,5 +1,9 @@ include { initOptions; saveFiles; getSoftwareName } from './../functions' +environment = params.conda ? "bioconda::samtools=1.10" : null +container = "quay.io/biocontainers/samtools:1.10--h2e538c0_3" +if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/samtools:1.10--h2e538c0_3" + process SAMTOOLS_STATS { label 'cpus_2' @@ -7,9 +11,8 @@ process SAMTOOLS_STATS { publishDir "${params.outdir}/Reports/${meta.id}/SamToolsStats", mode: params.publish_dir_mode - container "quay.io/biocontainers/samtools:1.10--h2e538c0_3" - - conda (params.conda ? "bioconda::samtools=1.10" : null) + conda environment + container container input: tuple val(meta), path(bam) diff --git a/modules/nf-core/software/strelka/germline.nf b/modules/nf-core/software/strelka/germline.nf index c93bbe5e87..6ae56fc59a 100644 --- a/modules/nf-core/software/strelka/germline.nf +++ b/modules/nf-core/software/strelka/germline.nf @@ -1,5 +1,9 @@ include { initOptions; saveFiles; getSoftwareName } from './../functions' +environment = params.conda ? 
"bioconda::strelka=2.9.10" : null +container = "quay.io/biocontainers/strelka:2.9.10--0" +if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/strelka:2.9.10--0" + process STRELKA_GERMLINE { tag "$meta.id" @@ -10,9 +14,8 @@ process STRELKA_GERMLINE { mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } - container "quay.io/biocontainers/strelka:2.9.10--0" - - conda (params.conda ? "bioconda::strelka=2.9.10" : null) + conda environment + container container input: tuple val(meta), path(bam), path (bai) diff --git a/modules/nf-core/software/trimgalore.nf b/modules/nf-core/software/trimgalore.nf index 23b2553651..432ccfde08 100644 --- a/modules/nf-core/software/trimgalore.nf +++ b/modules/nf-core/software/trimgalore.nf @@ -1,5 +1,9 @@ include { initOptions; saveFiles; getSoftwareName } from './functions' +environment = params.conda ? "bioconda::trim-galore=0.6.5" : null +container = "quay.io/biocontainers/trim-galore:0.6.5--0" +if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/trim-galore:0.6.5--0" + process TRIMGALORE { tag "${meta.id}" label 'process_high' @@ -11,9 +15,8 @@ process TRIMGALORE { else if (filename.endsWith('.version.txt')) null else filename } - container "quay.io/biocontainers/trim-galore:0.6.5--0" - - conda (params.conda ? "trim-galore=0.6.5" : null) + conda environment + container container input: tuple val(meta), path(reads) From 2078c0337d65059b259f6b41ac2f5aec49f0948c Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 14 Oct 2020 09:49:50 +0200 Subject: [PATCH 191/200] fix: typo --- modules/nf-core/software/multiqc.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/nf-core/software/multiqc.nf b/modules/nf-core/software/multiqc.nf index 33c931ad99..a8868d74d7 100644 --- a/modules/nf-core/software/multiqc.nf +++ b/modules/nf-core/software/multiqc.nf @@ -1,7 +1,7 @@ include { initOptions; saveFiles; getSoftwareName } from './functions' -environment = params.conda ? "bioconda::qualimap=2.2.2d" : null -container = "quay.io/biocontainers/multiqc=1.9" +environment = params.conda ? "bioconda::multiqc=1.9" : null +container = "quay.io/biocontainers/multiqc:1.9--py_1" if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/multiqc:1.9--py_1" // Has the run name been specified by the user? 
From c145da27ad66fe62ab089bc8c3eca1af696bfab7 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 15 Oct 2020 13:22:50 +0200 Subject: [PATCH 192/200] chores: update to new modules syntax --- conf/modules.config | 206 +++++++++++++++--- main.nf | 65 ++++-- modules/local/process/build_intervals.nf | 12 +- modules/local/process/bwa_mem.nf | 17 +- modules/local/process/bwamem2_mem.nf | 8 +- modules/local/process/concat_vcf.nf | 8 +- modules/local/process/create_intervals_bed.nf | 7 +- modules/local/process/merge_bam.nf | 8 +- modules/local/subworkflow/build_indices.nf | 52 +++-- .../subworkflow/germline_variant_calling.nf | 26 ++- modules/local/subworkflow/mapping.nf | 28 ++- modules/local/subworkflow/markduplicates.nf | 4 +- .../subworkflow/prepare_recalibration.nf | 7 +- modules/local/subworkflow/recalibrate.nf | 21 +- modules/nf-core/software/bwa/index/main.nf | 8 +- modules/nf-core/software/bwamem2_index.nf | 8 +- modules/nf-core/software/fastqc.nf | 8 +- modules/nf-core/software/gatk/applybqsr.nf | 7 +- .../nf-core/software/gatk/baserecalibrator.nf | 7 +- .../software/gatk/createsequencedictionary.nf | 8 +- .../software/gatk/gatherbqsrreports.nf | 7 +- modules/nf-core/software/gatk/genotypegvcf.nf | 7 +- .../nf-core/software/gatk/haplotypecaller.nf | 7 +- .../nf-core/software/gatk/markduplicates.nf | 7 +- modules/nf-core/software/htslib_tabix.nf | 10 +- modules/nf-core/software/multiqc.nf | 7 +- modules/nf-core/software/qualimap_bamqc.nf | 7 +- modules/nf-core/software/samtools/faidx.nf | 8 +- modules/nf-core/software/samtools/index.nf | 8 +- modules/nf-core/software/samtools/stats.nf | 7 +- modules/nf-core/software/strelka/germline.nf | 24 +- modules/nf-core/software/trimgalore.nf | 8 +- modules/nf-core/subworkflow/qc_trim.nf | 11 +- nextflow.config | 10 +- 34 files changed, 449 insertions(+), 194 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 28cd1fb5f8..6664a5deff 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -6,7 +6,8 @@ params { modules { - 'bwa_index' { +// BUILD_INDICES + 'build_intervals' { args = "" args2 = "" extra = "" @@ -15,14 +16,14 @@ params { publish_dir = "reference/bwa" publish_results = "all" } - 'bwa_mem' { - args = "-K 100000000 -M" + 'bwa_index' { + args = "" args2 = "" extra = "" suffix = "" publish_by_id = "false" - publish_dir = "" - publish_results = "none" + publish_dir = "reference/bwa" + publish_results = "all" } 'bwamem2_index' { args = "" @@ -33,43 +34,43 @@ params { publish_dir = "reference/bwamem2" publish_results = "all" } - 'bwamem2_mem' { - args = "-K 100000000 -M" + 'create_intervals_bed' { + args = "" args2 = "" extra = "" suffix = "" publish_by_id = "false" - publish_dir = "" - publish_results = "none" + publish_dir = "reference" + publish_results = "all" } - 'concat_vcf_haplotypecaller' { + 'gatk_dict' { args = "" args2 = "" extra = "" suffix = "" publish_by_id = "false" - publish_dir = "haplotypecaller" + publish_dir = "reference" publish_results = "all" } - 'concat_vcf_haplotypecallergvcf' { + 'samtools_faidx' { args = "" args2 = "" extra = "" - suffix = ".g" + suffix = "" publish_by_id = "false" - publish_dir = "haplotypecallergvcf" - publish_results = "all" + publish_dir = "reference" + publish_results = "none" } - 'fastqc' { - args = "--quiet" + 'tabix_dbsnp' { args = "" + args2 = "" extra = "" suffix = "" publish_by_id = "false" - publish_dir = "Reports/FastQC" + publish_dir = "reference" publish_results = "all" } - 'gatk_createsequencedictionary' { + 'tabix_germline_resource' { args = "" args2 = "" 
extra = "" @@ -78,35 +79,36 @@ params { publish_dir = "reference" publish_results = "all" } - 'gatk_markduplicates' { - args = "ASSUME_SORTED=true REMOVE_DUPLICATES=false VALIDATION_STRINGENCY=LENIENT TMP_DIR=tmp" + 'tabix_known_indels' { + args = "" args2 = "" extra = "" - suffix = ".md" + suffix = "" publish_by_id = "false" - publish_dir = "" + publish_dir = "reference" publish_results = "all" } - 'get_software_versions' { + 'tabix_pon' { args = "" args2 = "" extra = "" suffix = "" publish_by_id = "false" - publish_dir = "pipeline_info" + publish_dir = "reference" publish_results = "all" } - 'htslib_tabix' { - args = "" +// MAPPING + 'bwa_mem1_mem' { + args = "-K 100000000 -M" args2 = "" extra = "" suffix = "" publish_by_id = "false" - publish_dir = "reference" - publish_results = "all" + publish_dir = "" + publish_results = "none" } - 'merge_bam_mapping' { - args = "" + 'bwa_mem2_mem' { + args = "-K 100000000 -M" args2 = "" extra = "" suffix = "" @@ -114,23 +116,23 @@ params { publish_dir = "" publish_results = "none" } - 'merge_bam_recalibrate' { + 'merge_bam_mapping' { args = "" args2 = "" extra = "" - suffix = "md" + suffix = "" publish_by_id = "false" publish_dir = "" publish_results = "none" } - 'samtools_faidx' { + 'qualimap_bamqc_mapping' { args = "" args2 = "" extra = "" suffix = "" publish_by_id = "false" - publish_dir = "reference" - publish_results = "none" + publish_dir = "qualimap" + publish_results = "all" } 'samtools_index_mapping' { args = "" @@ -143,6 +145,72 @@ params { publish_dir_down = "mapped" publish_results = "none" } + 'samtools_stats_mapping' { + args = "" + args2 = "" + extra = "" + suffix = "" + publish_by_id = "false" + publish_dir = "samtools_stats" + publish_results = "none" + } +// MARKDUPLICATES + 'markduplicates' { + args = "ASSUME_SORTED=true REMOVE_DUPLICATES=false VALIDATION_STRINGENCY=LENIENT TMP_DIR=tmp" + args2 = "" + extra = "" + suffix = ".md" + publish_by_id = "false" + publish_dir = "" + publish_results = "all" + } +// PREPARE_RECALIBRATION + 'baserecalibrator' { + args = "" + args2 = "" + extra = "" + suffix = "" + publish_by_id = "false" + publish_dir = "haplotypecaller" + publish_results = "all" + } + 'gatherbqsrreports' { + args = "" + args2 = "" + extra = "" + suffix = "" + publish_by_id = "false" + publish_dir = "haplotypecaller" + publish_results = "all" + } +// RECALIBRATE + 'applybqsr' { + args = "" + args2 = "" + extra = "" + suffix = "md" + publish_by_id = "false" + publish_dir = "" + publish_results = "none" + } + 'merge_bam_recalibrate' { + args = "" + args2 = "" + extra = "" + suffix = "md" + publish_by_id = "false" + publish_dir = "" + publish_results = "none" + } + 'qualimap_bamqc_recalibrate' { + args = "" + args2 = "" + extra = "" + suffix = "" + publish_by_id = "false" + publish_dir = "qualimap" + publish_results = "all" + } 'samtools_index_recalibrate' { args = "" args2 = "" @@ -154,6 +222,52 @@ params { publish_dir_down = "recalibrated" publish_results = "all" } + 'samtools_stats_recalibrate' { + args = "" + args2 = "" + extra = "" + suffix = "" + publish_by_id = "false" + publish_dir = "samtools_stats" + publish_results = "none" + } +// GERMLINE_VARIANT_CALLING + 'haplotypecaller' { + args = "" + args2 = "" + extra = "" + suffix = "" + publish_by_id = "false" + publish_dir = "haplotypecaller" + publish_results = "none" + } + 'genotypegvcf' { + args = "" + args2 = "" + extra = "" + suffix = "" + publish_by_id = "false" + publish_dir = "haplotypecallergvcf" + publish_results = "none" + } + 'concat_haplotypecaller' { + args 
= "" + args2 = "" + extra = "" + suffix = "" + publish_by_id = "false" + publish_dir = "haplotypecaller" + publish_results = "all" + } + 'concat_gvcf' { + args = "" + args2 = "" + extra = "" + suffix = ".g" + publish_by_id = "false" + publish_dir = "haplotypecallergvcf" + publish_results = "all" + } 'strelka_germline' { args = "" args2 = "" @@ -163,6 +277,16 @@ params { publish_dir = "strelka" publish_results = "all" } +// QC_TRIM + 'fastqc' { + args = "--quiet" + args = "" + extra = "" + suffix = "" + publish_by_id = "false" + publish_dir = "Reports/FastQC" + publish_results = "all" + } 'trimgalore' { args = "--fastqc" args2 = "" @@ -172,5 +296,15 @@ params { publish_dir = "trimgalore" publish_results = "all" } +// OTHERS + 'get_software_versions' { + args = "" + args2 = "" + extra = "" + suffix = "" + publish_by_id = "false" + publish_dir = "pipeline_info" + publish_results = "all" + } } } diff --git a/main.nf b/main.nf index 7abbb6f476..305edfe6ca 100644 --- a/main.nf +++ b/main.nf @@ -171,7 +171,8 @@ if (tsv_path) { ================================================================================ */ -modules = params.modules +modules = params.modules.clone() +if (save_bam_mapped) modules['samtools_index_mapping'].publish_results = "all" // Initialize each params in params.genomes, catch the command line first if it was defined params.ac_loci = params.genome ? params.genomes[params.genome].ac_loci ?: false : false @@ -262,12 +263,47 @@ if (params.sentieon) log.warn "[nf-core/sarek] Sentieon will be used, only works ================================================================================ */ -include { BUILD_INDICES } from './modules/local/subworkflow/build_indices' -include { MAPPING } from './modules/local/subworkflow/mapping' -include { MARKDUPLICATES } from './modules/local/subworkflow/markduplicates' -include { PREPARE_RECALIBRATION } from './modules/local/subworkflow/prepare_recalibration' -include { RECALIBRATE } from './modules/local/subworkflow/recalibrate' -include { GERMLINE_VARIANT_CALLING } from './modules/local/subworkflow/germline_variant_calling' +include { BUILD_INDICES } from './modules/local/subworkflow/build_indices' addParams( + build_intervals_options: modules['build_intervals'], + bwa_index_options: modules['bwa_index'], + bwamem2_index_options: modules['bwamem2_index'], + create_intervals_bed_options: modules['create_intervals_bed'], + gatk_dict_options: modules['gatk_dict'], + samtools_faidx_options: modules['samtools_faidx'], + tabix_dbsnp_options: modules['tabix_dbsnp'], + tabix_germline_resource_options: modules['tabix_germline_resource'], + tabix_known_indels_options: modules['tabix_known_indels'], + tabix_pon_options: modules['tabix_pon'] +) +include { MAPPING } from './modules/local/subworkflow/mapping' addParams( + bwamem1_mem_options: modules['bwa_mem1_mem'], + bwamem2_mem_options: modules['bwa_mem2_mem'], + merge_bam_options: modules['merge_bam_mapping'], + qualimap_bamqc_options: modules['qualimap_bamqc_mapping'], + samtools_index_options: modules['samtools_index_mapping'], + samtools_stats_options: modules['samtools_stats_mapping'] +) +include { MARKDUPLICATES } from './modules/local/subworkflow/markduplicates' addParams( + markduplicates_options: modules['markduplicates'] +) +include { PREPARE_RECALIBRATION } from './modules/local/subworkflow/prepare_recalibration' addParams( + baserecalibrator_options: modules['baserecalibrator'], + gatherbqsrreports_options: modules['gatherbqsrreports'] +) +include { RECALIBRATE } from 
'./modules/local/subworkflow/recalibrate' addParams( + applybqsr_options: modules['applybqsr'], + merge_bam_options: modules['merge_bam_recalibrate'], + qualimap_bamqc_options: modules['qualimap_bamqc_recalibrate'], + samtools_index_options: modules['samtools_index_recalibrate'], + samtools_stats_options: modules['samtools_stats_recalibrate'] +) +include { GERMLINE_VARIANT_CALLING } from './modules/local/subworkflow/germline_variant_calling' addParams( + haplotypecaller_options: modules['haplotypecaller'], + genotypegvcf_options: modules['genotypegvcf'], + concat_gvcf_options: modules['concat_gvcf'], + concat_haplotypecaller_options: modules['concat_haplotypecaller'], + strelka_options: modules['strelka_germline'] +) /* ================================================================================ @@ -283,8 +319,10 @@ include { MULTIQC } from './modules/nf-core/software/multi ================================================================================ */ -include { QC_TRIM } from './modules/nf-core/subworkflow/qc_trim' - +include { QC_TRIM } from './modules/nf-core/subworkflow/qc_trim' addParams( + fastqc_options: modules['fastqc'], + trimgalore_options: modules['trimgalore'] +) // PREPARING CHANNELS FOR PREPROCESSING AND QC // input_bam = Channel.empty() @@ -345,7 +383,6 @@ workflow { fasta, germline_resource, known_indels, - modules, pon, step, tools) @@ -381,8 +418,7 @@ workflow { QC_TRIM( input_sample, ('fastqc' in skip_qc || step != "mapping"), - !(params.trim_fastq), - modules) + !(params.trim_fastq)) reads_input = QC_TRIM.out.reads @@ -395,15 +431,12 @@ workflow { // STEP 1: MAPPING READS TO REFERENCE GENOME WITH BWA-MEM - if (save_bam_mapped) modules['samtools_index_mapping'].publish_results = "all" - MAPPING( ('bamqc' in skip_qc), ('samtools' in skip_qc), bwa, fai, fasta, - modules, reads_input, save_bam_mapped, step, @@ -453,7 +486,6 @@ workflow { fai, fasta, intervals, - modules, step, target_bed) @@ -480,7 +512,6 @@ workflow { fai, fasta, intervals, - modules, target_bed, tools) diff --git a/modules/local/process/build_intervals.nf b/modules/local/process/build_intervals.nf index fd0c7df1a6..993db4670e 100644 --- a/modules/local/process/build_intervals.nf +++ b/modules/local/process/build_intervals.nf @@ -1,14 +1,18 @@ +// Import generic module functions include { initOptions; saveFiles; getSoftwareName } from './../../nf-core/software/functions' -environment = params.conda ? "anaconda::gawk=5.1.0" : null +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? "anaconda::gawk=5.1.0" : null container = "quay.io/biocontainers/gawk:5.1.0" -if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/gawk:5.1.0" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/gawk:5.1.0" process BUILD_INTERVALS { tag fai - publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: {params.save_reference ? 
"reference_genome/${it}" : null } + publishDir "${params.outdir}", mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } conda environment container container diff --git a/modules/local/process/bwa_mem.nf b/modules/local/process/bwa_mem.nf index 2206173399..dd654d10f7 100644 --- a/modules/local/process/bwa_mem.nf +++ b/modules/local/process/bwa_mem.nf @@ -1,19 +1,21 @@ +// Import generic module functions include { initOptions; saveFiles; getSoftwareName } from './../../nf-core/software/functions' -environment = params.conda ? "bioconda::bwa=0.7.17 bioconda::samtools=1.10" : null +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? "bioconda::bwa=0.7.17 bioconda::samtools=1.10" : null container = "quay.io/biocontainers/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:eabfac3657eda5818bae4090db989e3d41b01542-0" -if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:eabfac3657eda5818bae4090db989e3d41b01542-0" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:eabfac3657eda5818bae4090db989e3d41b01542-0" process BWA_MEM { tag "${meta.id}" label 'process_high' - publishDir "${params.outdir}/bwa/${meta.sample}", - mode: params.publish_dir_mode, - saveAs: { filename -> - if (filename.endsWith('.version.txt')) null - else filename } + publishDir "${params.outdir}", mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + conda environment container container @@ -23,7 +25,6 @@ process BWA_MEM { path bwa path fasta path fai - val options output: tuple val(meta), path("*.bam"), emit: bam diff --git a/modules/local/process/bwamem2_mem.nf b/modules/local/process/bwamem2_mem.nf index d6f01ff770..7f7f306587 100644 --- a/modules/local/process/bwamem2_mem.nf +++ b/modules/local/process/bwamem2_mem.nf @@ -1,8 +1,11 @@ include { initOptions; saveFiles; getSoftwareName } from './../../nf-core/software/functions' -environment = params.conda ? "bioconda::bwa-mem2=2.0 bioconda::samtools=1.10" : null +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? 
"bioconda::bwa-mem2=2.0 bioconda::samtools=1.10" : null container = "quay.io/biocontainers/mulled-v2-e5d375990341c5aef3c9aff74f96f66f65375ef6:876eb6f1d38fbf578296ea94e5aede4e317939e7-0" -if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/mulled-v2-e5d375990341c5aef3c9aff74f96f66f65375ef6:876eb6f1d38fbf578296ea94e5aede4e317939e7-0" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/mulled-v2-e5d375990341c5aef3c9aff74f96f66f65375ef6:876eb6f1d38fbf578296ea94e5aede4e317939e7-0" process BWAMEM2_MEM { tag "${meta.id}" @@ -23,7 +26,6 @@ process BWAMEM2_MEM { path bwa path fasta path fai - val options output: tuple val(meta), path("*.bam") diff --git a/modules/local/process/concat_vcf.nf b/modules/local/process/concat_vcf.nf index b44abcd5c9..367eee1b0c 100644 --- a/modules/local/process/concat_vcf.nf +++ b/modules/local/process/concat_vcf.nf @@ -1,8 +1,11 @@ include { initOptions; saveFiles; getSoftwareName } from './../../nf-core/software/functions' -environment = params.conda ? "bioconda::htslib=1.11" : null +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? "bioconda::htslib=1.11" : null container = "quay.io/biocontainers/htslib:1.11--hd3b49d5_0" -if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/htslib:1.11--hd3b49d5_0" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/htslib:1.11--hd3b49d5_0" process CONCAT_VCF { label 'cpus_8' @@ -18,7 +21,6 @@ process CONCAT_VCF { tuple val(meta), path(vcf) path fai path bed - val options output: tuple val(meta), path("*_*.vcf.gz"), path("*_*.vcf.gz.tbi"), emit: vcf diff --git a/modules/local/process/create_intervals_bed.nf b/modules/local/process/create_intervals_bed.nf index 6d4470612e..e8e3cc5c0b 100644 --- a/modules/local/process/create_intervals_bed.nf +++ b/modules/local/process/create_intervals_bed.nf @@ -1,9 +1,12 @@ include { initOptions; saveFiles; getSoftwareName } from './../../nf-core/software/functions' include { has_extension } from '../functions' -environment = params.conda ? "anaconda::gawk=5.1.0" : null +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? "anaconda::gawk=5.1.0" : null container = "quay.io/biocontainers/gawk:5.1.0" -if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/gawk:5.1.0" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/gawk:5.1.0" process CREATE_INTERVALS_BED { tag "${intervals}" diff --git a/modules/local/process/merge_bam.nf b/modules/local/process/merge_bam.nf index 58069773c8..b427ad4887 100644 --- a/modules/local/process/merge_bam.nf +++ b/modules/local/process/merge_bam.nf @@ -1,8 +1,11 @@ include { initOptions; saveFiles; getSoftwareName } from './../../nf-core/software/functions' -environment = params.conda ? "bioconda::samtools=1.10" : null +params.options = [:] +def options = initOptions(params.options) + +environment = params.enable_conda ? 
"bioconda::samtools=1.10" : null container = "quay.io/biocontainers/samtools:1.10--h2e538c0_3" -if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/samtools:1.10--h2e538c0_3" +if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/samtools:1.10--h2e538c0_3" process MERGE_BAM { label 'cpus_8' @@ -14,7 +17,6 @@ process MERGE_BAM { input: tuple val(meta), path(bam) - val options output: tuple val(meta), path("${name}.bam"), emit: bam diff --git a/modules/local/subworkflow/build_indices.nf b/modules/local/subworkflow/build_indices.nf index ac48b9eb76..58c057977a 100644 --- a/modules/local/subworkflow/build_indices.nf +++ b/modules/local/subworkflow/build_indices.nf @@ -4,18 +4,29 @@ ================================================================================ */ -// And then initialize channels based on params or indices that were just built +params.build_intervals_options = [:] +params.bwa_index_options = [:] +params.bwamem2_index_options = [:] +params.create_intervals_bed_options = [:] +params.gatk_dict_options = [:] +params.samtools_faidx_options = [:] +params.tabix_dbsnp_options = [:] +params.tabix_germline_resource_options = [:] +params.tabix_known_indels_options = [:] +params.tabix_pon_options = [:] -include { BUILD_INTERVALS } from '../process/build_intervals.nf' -include { BWA_INDEX } from '../../nf-core/software/bwa/index/main.nf' -include { BWAMEM2_INDEX } from '../../nf-core/software/bwamem2_index.nf' -include { CREATE_INTERVALS_BED } from '../process/create_intervals_bed.nf' -include { GATK_CREATESEQUENCEDICTIONARY as GATK_DICT } from '../../nf-core/software/gatk/createsequencedictionary.nf' -include { HTSLIB_TABIX as TABIX_DBSNP; - HTSLIB_TABIX as TABIX_GERMLINE_RESOURCE; - HTSLIB_TABIX as TABIX_KNOWN_INDELS; - HTSLIB_TABIX as TABIX_PON;} from '../../nf-core/software/htslib_tabix' -include { SAMTOOLS_FAIDX } from '../../nf-core/software/samtools/faidx.nf' +// Initialize channels based on params or indices that were just built + +include { BUILD_INTERVALS } from '../process/build_intervals.nf' addParams(option: params.build_intervals_options) +include { BWA_INDEX } from '../../nf-core/software/bwa/index/main.nf' addParams(option: params.bwa_index_options) +include { BWAMEM2_INDEX } from '../../nf-core/software/bwamem2_index.nf' addParams(option: params.bwamem2_index_options) +include { CREATE_INTERVALS_BED } from '../process/create_intervals_bed.nf' addParams(option: params.create_intervals_bed_options) +include { GATK_CREATESEQUENCEDICTIONARY as GATK_DICT } from '../../nf-core/software/gatk/createsequencedictionary.nf' addParams(option: params.gatk_dict_options) +include { HTSLIB_TABIX as TABIX_DBSNP } from '../../nf-core/software/htslib_tabix' addParams(option: params.tabix_dbsnp_options) +include { HTSLIB_TABIX as TABIX_GERMLINE_RESOURCE } from '../../nf-core/software/htslib_tabix' addParams(option: params.tabix_germline_resource_options) +include { HTSLIB_TABIX as TABIX_KNOWN_INDELS } from '../../nf-core/software/htslib_tabix' addParams(option: params.tabix_known_indels_options) +include { HTSLIB_TABIX as TABIX_PON } from '../../nf-core/software/htslib_tabix' addParams(option: params.tabix_pon_options) +include { SAMTOOLS_FAIDX } from '../../nf-core/software/samtools/faidx.nf' addParams(option: params.samtools_faidx_options) workflow BUILD_INDICES{ take: @@ -23,7 +34,6 @@ workflow BUILD_INDICES{ fasta // channel: [mandatory] fasta 
diff --git a/modules/local/subworkflow/germline_variant_calling.nf b/modules/local/subworkflow/germline_variant_calling.nf
index ba48c158aa..8fcd21b50e 100644
--- a/modules/local/subworkflow/germline_variant_calling.nf
+++ b/modules/local/subworkflow/germline_variant_calling.nf
@@ -4,11 +4,17 @@
 ================================================================================
 */

-include { GATK_HAPLOTYPECALLER as HAPLOTYPECALLER } from '../../nf-core/software/gatk/haplotypecaller'
-include { GATK_GENOTYPEGVCF as GENOTYPEGVCF } from '../../nf-core/software/gatk/genotypegvcf'
-include { CONCAT_VCF as CONCAT_GVCF;
-          CONCAT_VCF as CONCAT_HAPLOTYPECALLER} from '../process/concat_vcf'
-include { STRELKA_GERMLINE as STRELKA } from '../../nf-core/software/strelka/germline'
+params.haplotypecaller_options = [:]
+params.genotypegvcf_options = [:]
+params.concat_gvcf_options = [:]
+params.concat_haplotypecaller_options = [:]
+params.strelka_options = [:]
+
+include { GATK_HAPLOTYPECALLER as HAPLOTYPECALLER } from '../../nf-core/software/gatk/haplotypecaller' addParams(option: params.haplotypecaller_options)
+include { GATK_GENOTYPEGVCF as GENOTYPEGVCF } from '../../nf-core/software/gatk/genotypegvcf' addParams(option: params.genotypegvcf_options)
+include { CONCAT_VCF as CONCAT_GVCF } from '../process/concat_vcf' addParams(option: params.concat_gvcf_options)
+include { CONCAT_VCF as CONCAT_HAPLOTYPECALLER } from '../process/concat_vcf' addParams(option: params.concat_haplotypecaller_options)
+include { STRELKA_GERMLINE as STRELKA } from '../../nf-core/software/strelka/germline' addParams(option: params.strelka_options)

 workflow GERMLINE_VARIANT_CALLING {
     take:
@@ -19,7 +25,6 @@ workflow GERMLINE_VARIANT_CALLING {
         fai // channel: [mandatory] fai
         fasta // channel: [mandatory] fasta
         intervals // channel: [mandatory] intervals
-        modules // map: [mandatory] maps for modules
         target_bed // channel: [optional] target_bed
         tools // list: [mandatory] list of tools
@@ -63,8 +68,7 @@ workflow GERMLINE_VARIANT_CALLING {
         CONCAT_GVCF(
             haplotypecaller_interval_gvcf,
             fai,
-            target_bed,
-            modules['concat_vcf_haplotypecallergvcf'])
+            target_bed)

         haplotypecaller_gvcf = CONCAT_GVCF.out.vcf
@@ -99,8 +103,7 @@ workflow GERMLINE_VARIANT_CALLING {
         CONCAT_HAPLOTYPECALLER(
             haplotypecaller_interval_vcf,
             fai,
-            target_bed,
-            modules['concat_vcf_haplotypecaller'])
+            target_bed)

         haplotypecaller_vcf = CONCAT_GVCF.out.vcf
     }
@@ -110,8 +113,7 @@
         STRELKA(
             bam,
             fasta,
             fai,
-            target_bed,
-            modules['strelka_germline'])
+            target_bed)

         strelka_vcf = STRELKA.out.vcf
     }
diff --git a/modules/local/subworkflow/mapping.nf b/modules/local/subworkflow/mapping.nf
index afe41428d8..4ca5effd2f 100644
--- a/modules/local/subworkflow/mapping.nf
+++ b/modules/local/subworkflow/mapping.nf
@@ -4,12 +4,19 @@
 ================================================================================
 */

-include { BWAMEM2_MEM } from '../process/bwamem2_mem'
-include { BWA_MEM as BWAMEM1_MEM } from '../process/bwa_mem'
-include { MERGE_BAM } from '../process/merge_bam'
-include { QUALIMAP_BAMQC } from '../../nf-core/software/qualimap_bamqc'
-include { SAMTOOLS_INDEX } from '../../nf-core/software/samtools/index'
-include { SAMTOOLS_STATS } from '../../nf-core/software/samtools/stats'
+params.bwamem1_mem_options = [:]
+params.bwamem2_mem_options = [:]
+params.merge_bam_options = [:]
+params.qualimap_bamqc_options = [:]
+params.samtools_index_options = [:]
+params.samtools_stats_options = [:]
+
+include { BWA_MEM as BWAMEM1_MEM } from '../process/bwa_mem' addParams(options: params.bwamem1_mem_options)
+include { BWAMEM2_MEM } from '../process/bwamem2_mem' addParams(options: params.bwamem2_mem_options)
+include { MERGE_BAM } from '../process/merge_bam' addParams(options: params.merge_bam_options)
+include { QUALIMAP_BAMQC } from '../../nf-core/software/qualimap_bamqc' addParams(options: params.qualimap_bamqc_options)
+include { SAMTOOLS_INDEX } from '../../nf-core/software/samtools/index' addParams(options: params.samtools_index_options)
+include { SAMTOOLS_STATS } from '../../nf-core/software/samtools/stats' addParams(options: params.samtools_stats_options)

 workflow MAPPING {
     take:
@@ -18,7 +25,6 @@ workflow MAPPING {
         bwa // channel: [mandatory] bwa
         fai // channel: [mandatory] fai
         fasta // channel: [mandatory] fasta
-        modules // map: options for modules
         reads_input // channel: [mandatory] reads_input
         save_bam_mapped // boolean: true/false
         step // value: [mandatory] starting step
@@ -34,10 +40,10 @@ workflow MAPPING {
     bam_bwamem2 = Channel.empty()

     if (params.aligner == "bwa-mem") {
-        BWAMEM1_MEM(reads_input, bwa, fasta, fai, modules['bwa_mem'])
+        BWAMEM1_MEM(reads_input, bwa, fasta, fai)
         bam_bwamem1 = BWAMEM1_MEM.out.bam
     } else {
-        BWAMEM2_MEM(reads_input, bwa, fasta, fai, modules['bwamem2_mem'])
+        BWAMEM2_MEM(reads_input, bwa, fasta, fai)
         bam_bwamem2 = BWAMEM2_MEM.out
     }
@@ -83,9 +89,9 @@ workflow MAPPING {
     // STEP 1.5: MERGING AND INDEXING BAM FROM MULTIPLE LANES

-    MERGE_BAM(bam_bwa_multiple, modules['merge_bam_mapping'])
+    MERGE_BAM(bam_bwa_multiple)
     bam_mapped = bam_bwa_single.mix(MERGE_BAM.out.bam)
-    bam_mapped_index = SAMTOOLS_INDEX(bam_mapped, modules['samtools_index_mapping'])
+    bam_mapped_index = SAMTOOLS_INDEX(bam_mapped)

     qualimap_bamqc = Channel.empty()
     samtools_stats = Channel.empty()
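Note: with `modules` gone from the `take:` blocks, a subworkflow's modules are configured entirely by whoever includes the subworkflow, and the `[:]` defaults declared at the top of `mapping.nf` act as fall-backs. The caller is not part of this patch, so the following is only a plausible sketch of how a root workflow would override those defaults; the option values shown are illustrative, not taken from the pipeline:

```groovy
// Hypothetical caller, not shown in the patch: addParams overrides the
// subworkflow's own defaults (params.bwamem1_mem_options = [:], ...), and
// mapping.nf forwards each map to its module with its own addParams call.
include { MAPPING } from './modules/local/subworkflow/mapping' addParams(
    bwamem1_mem_options:    [args: '-K 100000000 -M'],           // aligner flags
    merge_bam_options:      [publish_dir: 'mapped'],             // illustrative
    samtools_stats_options: [publish_dir: 'reports/samtools_stats']
)
```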
diff --git a/modules/local/subworkflow/markduplicates.nf b/modules/local/subworkflow/markduplicates.nf
index 47e550c960..b0cf70b702 100644
--- a/modules/local/subworkflow/markduplicates.nf
+++ b/modules/local/subworkflow/markduplicates.nf
@@ -4,7 +4,9 @@
 ================================================================================
 */

-include { GATK_MARKDUPLICATES } from '../../nf-core/software/gatk/markduplicates'
+params.gatk_markduplicates_options = [:]
+
+include { GATK_MARKDUPLICATES } from '../../nf-core/software/gatk/markduplicates' addParams(options: params.gatk_markduplicates_options)

 workflow MARKDUPLICATES {
     take:
diff --git a/modules/local/subworkflow/prepare_recalibration.nf b/modules/local/subworkflow/prepare_recalibration.nf
index 30817bc0be..510e4bbb55 100644
--- a/modules/local/subworkflow/prepare_recalibration.nf
+++ b/modules/local/subworkflow/prepare_recalibration.nf
@@ -4,8 +4,11 @@
 ================================================================================
 */

-include { GATK_BASERECALIBRATOR as BASERECALIBRATOR } from '../../nf-core/software/gatk/baserecalibrator'
-include { GATK_GATHERBQSRREPORTS as GATHERBQSRREPORTS } from '../../nf-core/software/gatk/gatherbqsrreports'
+params.baserecalibrator_options = [:]
+params.gatherbqsrreports_options = [:]
+
+include { GATK_BASERECALIBRATOR as BASERECALIBRATOR } from '../../nf-core/software/gatk/baserecalibrator' addParams(options: params.baserecalibrator_options)
+include { GATK_GATHERBQSRREPORTS as GATHERBQSRREPORTS } from '../../nf-core/software/gatk/gatherbqsrreports' addParams(options: params.gatherbqsrreports_options)

 workflow PREPARE_RECALIBRATION {
     take:
diff --git a/modules/local/subworkflow/recalibrate.nf b/modules/local/subworkflow/recalibrate.nf
index f910120fd0..2f62a6e562 100644
--- a/modules/local/subworkflow/recalibrate.nf
+++ b/modules/local/subworkflow/recalibrate.nf
@@ -4,11 +4,17 @@
 ================================================================================
 */

-include { GATK_APPLYBQSR as APPLYBQSR } from '../../nf-core/software/gatk/applybqsr'
-include { MERGE_BAM } from '../process/merge_bam'
-include { SAMTOOLS_STATS } from '../../nf-core/software/samtools/stats'
-include { SAMTOOLS_INDEX } from '../../nf-core/software/samtools/index'
-include { QUALIMAP_BAMQC } from '../../nf-core/software/qualimap_bamqc'
+params.applybqsr_options = [:]
+params.merge_bam_options = [:]
+params.qualimap_bamqc_options = [:]
+params.samtools_index_options = [:]
+params.samtools_stats_options = [:]
+
+include { GATK_APPLYBQSR as APPLYBQSR } from '../../nf-core/software/gatk/applybqsr' addParams(options: params.applybqsr_options)
+include { MERGE_BAM } from '../process/merge_bam' addParams(options: params.merge_bam_options)
+include { QUALIMAP_BAMQC } from '../../nf-core/software/qualimap_bamqc' addParams(options: params.qualimap_bamqc_options)
+include { SAMTOOLS_INDEX } from '../../nf-core/software/samtools/index' addParams(options: params.samtools_index_options)
+include { SAMTOOLS_STATS } from '../../nf-core/software/samtools/stats' addParams(options: params.samtools_stats_options)

 workflow RECALIBRATE {
     take:
@@ -19,7 +25,6 @@ workflow RECALIBRATE {
         fai // channel: [mandatory] fai
         fasta // channel: [mandatory] fasta
         intervals // channel: [mandatory] intervals
-        modules // map: options for modules
         step // value: [mandatory] starting step
         target_bed // channel: [optional] target_bed
@@ -61,12 +66,12 @@ workflow RECALIBRATE {
             [meta, bam]
         }

-        MERGE_BAM(bam_recalibrated_interval, modules['merge_bam_recalibrate'])
+        MERGE_BAM(bam_recalibrated_interval)
         bam_recalibrated = MERGE_BAM.out.bam
         tsv_recalibrated = MERGE_BAM.out.tsv
     }

-    bam_recalibrated_index = SAMTOOLS_INDEX(bam_recalibrated, modules['samtools_index_recalibrate'])
+    bam_recalibrated_index = SAMTOOLS_INDEX(bam_recalibrated)

     qualimap_bamqc = Channel.empty()
     samtools_stats = Channel.empty()
diff --git a/modules/nf-core/software/bwa/index/main.nf b/modules/nf-core/software/bwa/index/main.nf
index 869ee420c0..91b86cfadf 100644
--- a/modules/nf-core/software/bwa/index/main.nf
+++ b/modules/nf-core/software/bwa/index/main.nf
@@ -1,8 +1,11 @@
 include { initOptions; saveFiles; getSoftwareName } from './../../functions'

-environment = params.conda ? "bioconda::bwa=0.7.17" : null
+params.options = [:]
+def options = initOptions(params.options)
+
+environment = params.enable_conda ? "bioconda::bwa=0.7.17" : null
 container = "quay.io/biocontainers/bwa:0.7.17--hed695b0_7"
-if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/bwa:0.7.17--hed695b0_7"
+if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/bwa:0.7.17--hed695b0_7"

 process BWA_INDEX {
     tag "${fasta}"
@@ -17,7 +20,6 @@ process BWA_INDEX {
     input:
         path fasta
-        val options

     output:
         path "${fasta}.*" , emit: index
diff --git a/modules/nf-core/software/bwamem2_index.nf b/modules/nf-core/software/bwamem2_index.nf
index 4e49ff773a..35fe9ac79e 100644
--- a/modules/nf-core/software/bwamem2_index.nf
+++ b/modules/nf-core/software/bwamem2_index.nf
@@ -1,8 +1,11 @@
 include { initOptions; saveFiles; getSoftwareName } from './functions'

-environment = params.conda ? "bioconda::bwa-mem2=2.0" : null
+params.options = [:]
+def options = initOptions(params.options)
+
+environment = params.enable_conda ? "bioconda::bwa-mem2=2.0" : null
 container = "quay.io/biocontainers/bwa-mem2:2.0--he513fc3_1"
-if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/bwa-mem2:2.0--he513fc3_1"
+if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/bwa-mem2:2.0--he513fc3_1"

 process BWAMEM2_INDEX {
     tag "${fasta}"
@@ -15,7 +18,6 @@ process BWAMEM2_INDEX {
     input:
         path fasta
-        val options

     output:
         path "${fasta}.*"
diff --git a/modules/nf-core/software/fastqc.nf b/modules/nf-core/software/fastqc.nf
index 3205de93db..6b0ac4f0dc 100644
--- a/modules/nf-core/software/fastqc.nf
+++ b/modules/nf-core/software/fastqc.nf
@@ -1,8 +1,11 @@
 include { initOptions; saveFiles; getSoftwareName } from './functions'

-environment = params.conda ? "bioconda::fastqc=0.11.9" : null
+params.options = [:]
+def options = initOptions(params.options)
+
+environment = params.enable_conda ? "bioconda::fastqc=0.11.9" : null
 container = "quay.io/biocontainers/fastqc:0.11.9--0"
-if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/fastqc:0.11.9--0"
+if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/fastqc:0.11.9--0"

 process FASTQC {
     tag "${meta.id}"
@@ -21,7 +24,6 @@ process FASTQC {
     input:
         tuple val(meta), path(reads)
-        val options

     output:
         path "*.html", emit: html
diff --git a/modules/nf-core/software/gatk/applybqsr.nf b/modules/nf-core/software/gatk/applybqsr.nf
index df12da4336..0861b22028 100644
--- a/modules/nf-core/software/gatk/applybqsr.nf
+++ b/modules/nf-core/software/gatk/applybqsr.nf
@@ -1,8 +1,11 @@
 include { initOptions; saveFiles; getSoftwareName } from './../functions'

-environment = params.conda ? "bioconda::gatk4-spark=4.1.8.1" : null
+params.options = [:]
+def options = initOptions(params.options)
+
+environment = params.enable_conda ? "bioconda::gatk4-spark=4.1.8.1" : null
 container = "quay.io/biocontainers/gatk4-spark:4.1.8.1--0"
-if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0"
+if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0"

 process GATK_APPLYBQSR {
     label 'memory_singleCPU_2_task'
diff --git a/modules/nf-core/software/gatk/baserecalibrator.nf b/modules/nf-core/software/gatk/baserecalibrator.nf
index fb780f095b..b02ea77b41 100644
--- a/modules/nf-core/software/gatk/baserecalibrator.nf
+++ b/modules/nf-core/software/gatk/baserecalibrator.nf
@@ -1,8 +1,11 @@
 include { initOptions; saveFiles; getSoftwareName } from './../functions'

-environment = params.conda ? "bioconda::gatk4-spark=4.1.8.1" : null
+params.options = [:]
+def options = initOptions(params.options)
+
+environment = params.enable_conda ? "bioconda::gatk4-spark=4.1.8.1" : null
 container = "quay.io/biocontainers/gatk4-spark:4.1.8.1--0"
-if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0"
+if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0"

 process GATK_BASERECALIBRATOR {
     label 'cpus_1'
diff --git a/modules/nf-core/software/gatk/createsequencedictionary.nf b/modules/nf-core/software/gatk/createsequencedictionary.nf
index b22f2c6e6e..0129e33b26 100644
--- a/modules/nf-core/software/gatk/createsequencedictionary.nf
+++ b/modules/nf-core/software/gatk/createsequencedictionary.nf
@@ -1,8 +1,11 @@
 include { initOptions; saveFiles; getSoftwareName } from './../functions'

-environment = params.conda ? "bioconda::gatk4-spark=4.1.8.1" : null
+params.options = [:]
+def options = initOptions(params.options)
+
+environment = params.enable_conda ? "bioconda::gatk4-spark=4.1.8.1" : null
 container = "quay.io/biocontainers/gatk4-spark:4.1.8.1--0"
-if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0"
+if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0"

 process GATK_CREATESEQUENCEDICTIONARY {
     tag "${fasta}"
@@ -15,7 +18,6 @@ process GATK_CREATESEQUENCEDICTIONARY {
     input:
         path fasta
-        val options

     output:
         path "${fasta.baseName}.dict"
diff --git a/modules/nf-core/software/gatk/gatherbqsrreports.nf b/modules/nf-core/software/gatk/gatherbqsrreports.nf
index 65c07fb5e2..c9686b0a8b 100644
--- a/modules/nf-core/software/gatk/gatherbqsrreports.nf
+++ b/modules/nf-core/software/gatk/gatherbqsrreports.nf
@@ -1,8 +1,11 @@
 include { initOptions; saveFiles; getSoftwareName } from './../functions'

-environment = params.conda ? "bioconda::gatk4-spark=4.1.8.1" : null
+params.options = [:]
+def options = initOptions(params.options)
+
+environment = params.enable_conda ? "bioconda::gatk4-spark=4.1.8.1" : null
 container = "quay.io/biocontainers/gatk4-spark:4.1.8.1--0"
-if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0"
+if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0"

 process GATK_GATHERBQSRREPORTS {
     label 'memory_singleCPU_2_task'
diff --git a/modules/nf-core/software/gatk/genotypegvcf.nf b/modules/nf-core/software/gatk/genotypegvcf.nf
index cc418cfbf4..32f3fd5485 100644
--- a/modules/nf-core/software/gatk/genotypegvcf.nf
+++ b/modules/nf-core/software/gatk/genotypegvcf.nf
@@ -1,8 +1,11 @@
 include { initOptions; saveFiles; getSoftwareName } from './../functions'

-environment = params.conda ? "bioconda::gatk4-spark=4.1.8.1" : null
+params.options = [:]
+def options = initOptions(params.options)
+
+environment = params.enable_conda ? "bioconda::gatk4-spark=4.1.8.1" : null
 container = "quay.io/biocontainers/gatk4-spark:4.1.8.1--0"
-if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0"
+if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0"

 process GATK_GENOTYPEGVCF {
     tag "${meta.id}-${interval.baseName}"
diff --git a/modules/nf-core/software/gatk/haplotypecaller.nf b/modules/nf-core/software/gatk/haplotypecaller.nf
index ee0186ba13..e0dd960803 100644
--- a/modules/nf-core/software/gatk/haplotypecaller.nf
+++ b/modules/nf-core/software/gatk/haplotypecaller.nf
@@ -1,8 +1,11 @@
 include { initOptions; saveFiles; getSoftwareName } from './../functions'

-environment = params.conda ? "bioconda::gatk4-spark=4.1.8.1" : null
+params.options = [:]
+def options = initOptions(params.options)
+
+environment = params.enable_conda ? "bioconda::gatk4-spark=4.1.8.1" : null
 container = "quay.io/biocontainers/gatk4-spark:4.1.8.1--0"
-if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0"
+if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0"

 process GATK_HAPLOTYPECALLER {
     label 'MEMORY_SINGLECPU_TASK_SQ'
diff --git a/modules/nf-core/software/gatk/markduplicates.nf b/modules/nf-core/software/gatk/markduplicates.nf
index 9f457a534a..b57ebcc44c 100644
--- a/modules/nf-core/software/gatk/markduplicates.nf
+++ b/modules/nf-core/software/gatk/markduplicates.nf
@@ -1,8 +1,11 @@
 include { initOptions; saveFiles; getSoftwareName } from './../functions'

-environment = params.conda ? "bioconda::gatk4-spark=4.1.8.1" : null
+params.options = [:]
+def options = initOptions(params.options)
+
+environment = params.enable_conda ? "bioconda::gatk4-spark=4.1.8.1" : null
 container = "quay.io/biocontainers/gatk4-spark:4.1.8.1--0"
-if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0"
+if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0"

 process GATK_MARKDUPLICATES {
     label 'cpus_16'
diff --git a/modules/nf-core/software/htslib_tabix.nf b/modules/nf-core/software/htslib_tabix.nf
index ceaeba3f5d..e1b514f248 100644
--- a/modules/nf-core/software/htslib_tabix.nf
+++ b/modules/nf-core/software/htslib_tabix.nf
@@ -1,8 +1,12 @@
+// Import generic module functions
 include { initOptions; saveFiles; getSoftwareName } from './functions'

-environment = params.conda ? "bioconda::tabix=0.2.6" : null
+params.options = [:]
+def options = initOptions(params.options)
+
+environment = params.enable_conda ? "bioconda::tabix=0.2.6" : null
 container = "quay.io/biocontainers/tabix:0.2.6--ha92aebf_0"
-if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/tabix:0.2.6--ha92aebf_0"
+if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/tabix:0.2.6--ha92aebf_0"

 process HTSLIB_TABIX {
     tag "${vcf}"
@@ -15,14 +19,12 @@ process HTSLIB_TABIX {
     input:
         path vcf
-        val options

     output:
         path "${vcf}.tbi"

     script:
     def software = getSoftwareName(task.process)
-    def ioptions = initOptions(options)
     """
     tabix -p vcf ${vcf}
diff --git a/modules/nf-core/software/multiqc.nf b/modules/nf-core/software/multiqc.nf
index a8868d74d7..ed201b0f49 100644
--- a/modules/nf-core/software/multiqc.nf
+++ b/modules/nf-core/software/multiqc.nf
@@ -1,8 +1,11 @@
 include { initOptions; saveFiles; getSoftwareName } from './functions'

-environment = params.conda ? "bioconda::multiqc=1.9" : null
+params.options = [:]
+def options = initOptions(params.options)
+
+environment = params.enable_conda ? "bioconda::multiqc=1.9" : null
 container = "quay.io/biocontainers/multiqc:1.9--py_1"
-if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/multiqc:1.9--py_1"
+if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/multiqc:1.9--py_1"

 // Has the run name been specified by the user?
 //  this has the bonus effect of catching both -name and --name
diff --git a/modules/nf-core/software/qualimap_bamqc.nf b/modules/nf-core/software/qualimap_bamqc.nf
index 515dc22cf6..3f5070e605 100644
--- a/modules/nf-core/software/qualimap_bamqc.nf
+++ b/modules/nf-core/software/qualimap_bamqc.nf
@@ -1,8 +1,11 @@
 include { initOptions; saveFiles; getSoftwareName } from './functions'

-environment = params.conda ? "bioconda::qualimap=2.2.2d" : null
+params.options = [:]
+def options = initOptions(params.options)
+
+environment = params.enable_conda ? "bioconda::qualimap=2.2.2d" : null
 container = "quay.io/biocontainers/qualimap:2.2.2d--1"
-if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/qualimap:2.2.2d--1"
+if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/qualimap:2.2.2d--1"

 process QUALIMAP_BAMQC {
     label 'memory_max'
diff --git a/modules/nf-core/software/samtools/faidx.nf b/modules/nf-core/software/samtools/faidx.nf
index 893c4ec5a9..ab08b37c2f 100644
--- a/modules/nf-core/software/samtools/faidx.nf
+++ b/modules/nf-core/software/samtools/faidx.nf
@@ -1,8 +1,11 @@
 include { initOptions; saveFiles; getSoftwareName } from './../functions'

-environment = params.conda ? "bioconda::samtools=1.10" : null
+params.options = [:]
+def options = initOptions(params.options)
+
+environment = params.enable_conda ? "bioconda::samtools=1.10" : null
 container = "quay.io/biocontainers/samtools:1.10--h2e538c0_3"
-if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/samtools:1.10--h2e538c0_3"
+if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/samtools:1.10--h2e538c0_3"

 process SAMTOOLS_FAIDX {
     tag "${fasta}"
@@ -15,7 +18,6 @@ process SAMTOOLS_FAIDX {
     input:
         path fasta
-        val options

     output:
         path "${fasta}.fai"
diff --git a/modules/nf-core/software/samtools/index.nf b/modules/nf-core/software/samtools/index.nf
index 4b042bb1dd..ffb3564b35 100644
--- a/modules/nf-core/software/samtools/index.nf
+++ b/modules/nf-core/software/samtools/index.nf
@@ -1,8 +1,11 @@
 include { initOptions; saveFiles; getSoftwareName } from './../functions'

-environment = params.conda ? "bioconda::samtools=1.10" : null
+params.options = [:]
+def options = initOptions(params.options)
+
+environment = params.enable_conda ? "bioconda::samtools=1.10" : null
 container = "quay.io/biocontainers/samtools:1.10--h2e538c0_3"
-if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/samtools:1.10--h2e538c0_3"
+if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/samtools:1.10--h2e538c0_3"

 process SAMTOOLS_INDEX {
     label 'cpus_8'
@@ -20,7 +23,6 @@ process SAMTOOLS_INDEX {
     input:
         tuple val(meta), path(bam)
-        val options

     output:
         tuple val(meta), path("${name}.bam"), path("*.bai")
diff --git a/modules/nf-core/software/samtools/stats.nf b/modules/nf-core/software/samtools/stats.nf
index 6138d07a05..aeaa06931d 100644
--- a/modules/nf-core/software/samtools/stats.nf
+++ b/modules/nf-core/software/samtools/stats.nf
@@ -1,8 +1,11 @@
 include { initOptions; saveFiles; getSoftwareName } from './../functions'

-environment = params.conda ? "bioconda::samtools=1.10" : null
+params.options = [:]
+def options = initOptions(params.options)
+
+environment = params.enable_conda ? "bioconda::samtools=1.10" : null
 container = "quay.io/biocontainers/samtools:1.10--h2e538c0_3"
-if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/samtools:1.10--h2e538c0_3"
+if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/samtools:1.10--h2e538c0_3"

 process SAMTOOLS_STATS {
     label 'cpus_2'
diff --git a/modules/nf-core/software/strelka/germline.nf b/modules/nf-core/software/strelka/germline.nf
index 6ae56fc59a..5f6d024160 100644
--- a/modules/nf-core/software/strelka/germline.nf
+++ b/modules/nf-core/software/strelka/germline.nf
@@ -1,11 +1,14 @@
 include { initOptions; saveFiles; getSoftwareName } from './../functions'

-environment = params.conda ? "bioconda::strelka=2.9.10" : null
+params.options = [:]
+def options = initOptions(params.options)
+
+environment = params.enable_conda ? "bioconda::strelka=2.9.10" : null
 container = "quay.io/biocontainers/strelka:2.9.10--0"
-if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/strelka:2.9.10--0"
+if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/strelka:2.9.10--0"

 process STRELKA_GERMLINE {
-    tag "$meta.id"
+    tag "${meta.id}"

     label 'CPUS_MAX'
     label 'MEMORY_MAX'
@@ -18,16 +21,15 @@ process STRELKA_GERMLINE {
     container container

     input:
-        tuple val(meta), path(bam), path (bai)
-        path fasta
-        path fai
-        path target_bed
-        val options
+        tuple val(meta), path(bam), path (bai)
+        path fasta
+        path fai
+        path target_bed

     output:
-        tuple val(meta), path("*_variants.vcf.gz"), path("*_variants.vcf.gz.tbi"), emit: vcf
-        tuple val(meta), path("*_genome.vcf.gz"), path("*_genome.vcf.gz.tbi"), emit: genome_vcf
-        path "*.version.txt", emit: version
+        tuple val(meta), path("*_variants.vcf.gz"), path("*_variants.vcf.gz.tbi"), emit: vcf
+        tuple val(meta), path("*_genome.vcf.gz"), path("*_genome.vcf.gz.tbi"), emit: genome_vcf
+        path "*.version.txt", emit: version

     script:
     def software = getSoftwareName(task.process)
diff --git a/modules/nf-core/software/trimgalore.nf b/modules/nf-core/software/trimgalore.nf
index 432ccfde08..7a233bd14c 100644
--- a/modules/nf-core/software/trimgalore.nf
+++ b/modules/nf-core/software/trimgalore.nf
@@ -1,8 +1,11 @@
 include { initOptions; saveFiles; getSoftwareName } from './functions'

-environment = params.conda ? "bioconda::trim-galore=0.6.5" : null
+params.options = [:]
+def options = initOptions(params.options)
+
+environment = params.enable_conda ? "bioconda::trim-galore=0.6.5" : null
 container = "quay.io/biocontainers/trim-galore:0.6.5--0"
-if (workflow.containerEngine == 'singularity') container = "https://depot.galaxyproject.org/singularity/trim-galore:0.6.5--0"
+if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/trim-galore:0.6.5--0"

 process TRIMGALORE {
     tag "${meta.id}"
@@ -20,7 +23,6 @@ process TRIMGALORE {
     input:
         tuple val(meta), path(reads)
-        val options

     output:
         tuple val(meta), path("*_1.fq.gz"), path("*_2.fq.gz"), emit: reads
diff --git a/modules/nf-core/subworkflow/qc_trim.nf b/modules/nf-core/subworkflow/qc_trim.nf
index 961065a864..b39476c5ef 100644
--- a/modules/nf-core/subworkflow/qc_trim.nf
+++ b/modules/nf-core/subworkflow/qc_trim.nf
@@ -1,9 +1,11 @@
 /*
  * Read QC and trimming
  */
+params.fastqc_options = [:]
+params.trimgalore_options = [:]

-include { FASTQC } from '../software/fastqc'
-include { TRIMGALORE } from '../software/trimgalore'
+include { FASTQC } from '../software/fastqc' addParams(options: params.fastqc_options)
+include { TRIMGALORE } from '../software/trimgalore' addParams(options: params.trimgalore_options)

 workflow QC_TRIM {
     take:
@@ -11,7 +13,6 @@ workflow QC_TRIM {
         reads // channel: [ val(meta), [ reads ] ]
         skip_fastqc // boolean: true/false
         skip_trimming // boolean: true/false
-        modules // map: options for modules

     main:

@@ -19,7 +20,7 @@ workflow QC_TRIM {
     fastqc_version = Channel.empty()
     fastqc_zip = Channel.empty()
     if (!skip_fastqc) {
-        FASTQC(reads, modules['fastqc'])
+        FASTQC(reads)
         fastqc_html = FASTQC.out.html
         fastqc_version = FASTQC.out.version
         fastqc_zip = FASTQC.out.zip
@@ -31,7 +32,7 @@ workflow QC_TRIM {
     trimgalore_log = Channel.empty()
     trimgalore_version = Channel.empty()
     if (!skip_trimming) {
-        TRIMGALORE(reads, modules['trimgalore'])
+        TRIMGALORE(reads)
         trim_reads = TRIMGALORE.out.reads
         trimgalore_html = TRIMGALORE.out.html
         trimgalore_zip = TRIMGALORE.out.zip
diff --git a/nextflow.config b/nextflow.config
index 9878efde48..3c131ce9c5 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -90,11 +90,13 @@ params {
     email_on_fail = false
     plaintext_email = false // Plaintext email disabled
     max_multiqc_email_size = 25.MB
-    hostnames = false
     name = false // No default name
     tracedir = "${params.outdir}/pipeline_info"

+    // Singularity containers
+    pull_docker_container = false // Pull default container by default
+
     // Base specifications
     // Defaults only, expecting to be overwritten
     cpus = 8
@@ -133,7 +135,7 @@ profiles {
     conda {
         docker.enabled = false
         singularity.enabled = false
-        params.conda = true
+        params.enable_conda = true
     }
     debug { process.beforeScript = 'echo $HOSTNAME' }
     docker {
@@ -142,13 +144,13 @@ profiles {
             fixOwnership = true
         }
         singularity.enabled = false
-        params.conda = false
+        params.enable_conda = false
     }
     singularity {
         docker.enabled = false
         singularity.autoMounts = true
         singularity.enabled = true
-        params.conda = false
+        params.enable_conda = false
     }
     test { includeConfig 'conf/test.config' }
     test_annotation { includeConfig 'conf/test_annotation.config' }

From 115655fcbb02863aca7ec0156081b9ce1622186a Mon Sep 17 00:00:00 2001
From: MaxUlysse
Date: Thu, 15 Oct 2020 13:29:31 +0200
Subject: [PATCH 193/200] chores: complete JSON schema

---
 nextflow_schema.json | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index 577e125a78..5df62372fa 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -39,7 +39,7 @@
         "outdir": {
             "type": "string",
             "description": "The output directory where the results will be saved.",
-            "default": "./results",
+            "default": "./results",
             "fa_icon": "fas fa-folder-open"
         },
         "email": {
@@ -152,7 +152,8 @@
         "clip_r2": {
             "type": "integer",
             "description": "Remove bp from the 5' end of read 5",
-            "help_text": "With Trim Galore"
+            "help_text": "With Trim Galore",
+            "fa_icon": "fas fa-cut"
         },
         "three_prime_clip_r1": {
             "type": "integer",
@@ -500,7 +501,7 @@
         "igenomes_base": {
             "type": "string",
            "description": "Directory / URL base for iGenomes references.",
-            "default": "s3://ngi-igenomes/igenomes/",
+            "default": "s3://ngi-igenomes/igenomes/",
             "fa_icon": "fas fa-cloud-download-alt",
             "hidden": true
         },
@@ -596,7 +597,7 @@
         "tracedir": {
            "type": "string",
             "description": "Directory to keep pipeline Nextflow logs and reports.",
-            "default": "${params.outdir}/pipeline_info",
+            "default": "${params.outdir}/pipeline_info",
             "fa_icon": "fas fa-cogs",
             "hidden": true
         },
@@ -669,7 +670,7 @@
         "custom_config_base": {
             "type": "string",
             "description": "Base directory for Institutional configs.",
-            "default": "https://raw.githubusercontent.com/nf-core/configs/master",
+            "default": "https://raw.githubusercontent.com/nf-core/configs/master",
             "hidden": true,
             "help_text": "If you're running offline, nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell nextflow where to find them with the `custom_config_base` option. For example:\n\n```bash\n## Download and unzip the config files\ncd /path/to/my/configs\nwget https://github.com/nf-core/configs/archive/master.zip\nunzip master.zip\n\n## Run the pipeline\ncd /path/to/my/data\nnextflow run /path/to/pipeline/ --custom_config_base /path/to/my/configs/configs-master/\n```\n\n> Note that the nf-core/tools helper package has a `download` command to download all required pipeline files + singularity containers + institutional configs in one go for you, to make this process easier.",
             "fa_icon": "fas fa-users-cog"
@@ -699,6 +700,22 @@
             "fa_icon": "fas fa-users-cog"
         }
     }
+    },
+    "modules": {
+        "title": "Modules",
+        "type": "object",
+        "description": "",
+        "default": "",
+        "properties": {
+            "pull_docker_container": {
+                "type": "boolean",
+                "fa_icon": "fab fa-docker",
+                "description": "Force pull and use of Docker container instead of default Singularity ones",
+                "hidden": true,
+                "help_text": "This may be useful if you are unable to download Singularity containers due to proxy issues."
+            }
+        },
+        "fa_icon": "fas fa-cog"
     }
 },
@@ -731,6 +748,9 @@
         },
         {
            "$ref": "#/definitions/institutional_config_options"
+        },
+        {
+            "$ref": "#/definitions/modules"
         }
     ]
 }
\ No newline at end of file
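Note: the two commits that follow funnel every module's output through a single `publishDir params.outdir` declaration whose `saveAs:` closure delegates to `saveFiles(...)`, driven by the per-module keys (`publish_dir`, `publish_files`, `publish_by_id`, `suffix`) that `conf/modules.config` is trimmed down to in PATCH 195. The patched `functions.nf` itself is not shown in this excerpt, so the sketch below is a hypothetical reading of that contract rather than the pipeline's actual implementation:

```groovy
// Hypothetical sketch of the saveFiles contract implied by the hunks below.
// The real implementation lives in modules/nf-core/software/functions.nf,
// which this excerpt does not include.
def saveFiles(Map args) {
    def ioptions = initOptions(args.options)
    def path_list = [ioptions.publish_dir ?: args.publish_dir]
    if (ioptions.publish_by_id == "true") {
        path_list.add(args.publish_id)          // e.g. mapped/<sample>/file.bam
    }
    if (ioptions.publish_files == "false") {
        return null                             // this module publishes nothing
    }
    return "${path_list.join('/')}/${args.filename}"
}
```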

From 552dc2aab583da0d0c1498ca25a55ddb430eed7e Mon Sep 17 00:00:00 2001
From: MaxUlysse
Date: Thu, 15 Oct 2020 14:32:56 +0200
Subject: [PATCH 194/200] chores: updates modules to new syntax

---
 .github/workflows/ci.yml                                  | 8 ++++----
 modules/local/process/build_intervals.nf                  | 2 +-
 modules/local/process/bwa_mem.nf                          | 2 +-
 modules/local/process/bwamem2_mem.nf                      | 7 ++-----
 modules/local/process/concat_vcf.nf                       | 3 ++-
 modules/local/process/create_intervals_bed.nf             | 3 +++
 modules/local/process/merge_bam.nf                        | 3 +++
 modules/nf-core/software/bwa/index/main.nf                | 2 +-
 modules/nf-core/software/bwamem2_index.nf                 | 2 +-
 modules/nf-core/software/fastqc.nf                        | 8 ++------
 modules/nf-core/software/gatk/applybqsr.nf                | 3 +++
 modules/nf-core/software/gatk/baserecalibrator.nf         | 3 +++
 modules/nf-core/software/gatk/createsequencedictionary.nf | 2 +-
 modules/nf-core/software/gatk/gatherbqsrreports.nf        | 5 +----
 modules/nf-core/software/gatk/genotypegvcf.nf             | 3 +++
 modules/nf-core/software/gatk/haplotypecaller.nf          | 3 +++
 modules/nf-core/software/gatk/markduplicates.nf           | 5 +----
 modules/nf-core/software/htslib_tabix.nf                  | 2 +-
 modules/nf-core/software/qualimap_bamqc.nf                | 3 ++-
 modules/nf-core/software/samtools/faidx.nf                | 2 +-
 modules/nf-core/software/samtools/index.nf                | 5 +----
 modules/nf-core/software/samtools/stats.nf                | 3 ++-
 modules/nf-core/software/strelka/germline.nf              | 5 ++---
 modules/nf-core/software/trimgalore.nf                    | 8 ++------
 24 files changed, 46 insertions(+), 46 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 4e3385947f..ed1a667eeb 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -71,13 +71,13 @@ jobs:
           NXF_VER: '20.07.1'
       - name: Get test data
         run: git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data
-      - name: Run germline test --step mapping
+      - name: Run germline test with ${{ matrix.markduplicates }} --step mapping
         run: nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input data/testdata/tiny/normal --save_bam_mapped
-      - name: Run germline test --step prepare_recalibration
+      - name: Run germline test with ${{ matrix.markduplicates }} --step prepare_recalibration
         run: nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step prepare_recalibration -resume
-      - name: Run germline test --step recalibrate
+      - name: Run germline test with ${{ matrix.markduplicates }} --step recalibrate
         run: nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step recalibrate -resume
-      - name: Run germline test --step variantCalling
+      - name: Run germline test with ${{ matrix.markduplicates }} --step variantCalling
         run: nextflow run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.markduplicates }} --input=false --step variantCalling

 #  annotation:
diff --git a/modules/local/process/build_intervals.nf b/modules/local/process/build_intervals.nf
index 993db4670e..697e0ec3d9 100644
--- a/modules/local/process/build_intervals.nf
+++ b/modules/local/process/build_intervals.nf
@@ -11,7 +11,7 @@ if (workflow.containerEngine == 'singularity' && !params.pull_docker_container)
 process BUILD_INTERVALS {
     tag fai

-    publishDir "${params.outdir}", mode: params.publish_dir_mode,
+    publishDir params.outdir, mode: params.publish_dir_mode,
         saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) }

     conda environment
diff --git a/modules/local/process/bwa_mem.nf b/modules/local/process/bwa_mem.nf
index dd654d10f7..305490de21 100644
--- a/modules/local/process/bwa_mem.nf
+++ b/modules/local/process/bwa_mem.nf
@@ -13,7 +13,7 @@ process BWA_MEM {

     label 'process_high'

-    publishDir "${params.outdir}", mode: params.publish_dir_mode,
+    publishDir params.outdir, mode: params.publish_dir_mode,
         saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) }
diff --git a/modules/local/process/bwamem2_mem.nf b/modules/local/process/bwamem2_mem.nf
index 7f7f306587..a25836f36f 100644
--- a/modules/local/process/bwamem2_mem.nf
+++ b/modules/local/process/bwamem2_mem.nf
@@ -12,11 +12,8 @@ process BWAMEM2_MEM {

     label 'process_high'

-    publishDir "${params.outdir}/bwamem2/${meta.sample}",
-        mode: params.publish_dir_mode,
-        saveAs: { filename ->
-            if (filename.endsWith('.version.txt')) null
-            else filename }
+    publishDir params.outdir, mode: params.publish_dir_mode,
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) }

     conda environment

     container container
diff --git a/modules/local/process/concat_vcf.nf b/modules/local/process/concat_vcf.nf
index 367eee1b0c..35a3af5540 100644
--- a/modules/local/process/concat_vcf.nf
+++ b/modules/local/process/concat_vcf.nf
@@ -12,7 +12,8 @@ process CONCAT_VCF {

     tag "${options.publish_dir}-${meta.id}"

-    publishDir "${params.outdir}/VariantCalling/${meta.id}/${options.publish_dir}", mode: params.publish_dir_mode
+    publishDir params.outdir, mode: params.publish_dir_mode,
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) }

     conda environment

     container container
diff --git a/modules/local/process/create_intervals_bed.nf b/modules/local/process/create_intervals_bed.nf
index e8e3cc5c0b..d90a0a900a 100644
--- a/modules/local/process/create_intervals_bed.nf
+++ b/modules/local/process/create_intervals_bed.nf
@@ -11,6 +11,9 @@ if (workflow.containerEngine == 'singularity' && !params.pull_docker_container)
 process CREATE_INTERVALS_BED {
     tag "${intervals}"

+    publishDir params.outdir, mode: params.publish_dir_mode,
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) }
+
     conda environment

     container container
diff --git a/modules/local/process/merge_bam.nf b/modules/local/process/merge_bam.nf
index b427ad4887..e190d8dbd1 100644
--- a/modules/local/process/merge_bam.nf
+++ b/modules/local/process/merge_bam.nf
@@ -12,6 +12,9 @@ process MERGE_BAM {

     tag "${meta.id}"

+    publishDir params.outdir, mode: params.publish_dir_mode,
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) }
+
     conda environment

     container container
diff --git a/modules/nf-core/software/bwa/index/main.nf b/modules/nf-core/software/bwa/index/main.nf
index 91b86cfadf..2dfd30d516 100644
--- a/modules/nf-core/software/bwa/index/main.nf
+++ b/modules/nf-core/software/bwa/index/main.nf
@@ -13,7 +13,7 @@ process BWA_INDEX {
     label 'process_high'

     publishDir params.outdir, mode: params.publish_dir_mode,
-        saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:'') }
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) }

     conda environment

     container container
diff --git a/modules/nf-core/software/bwamem2_index.nf b/modules/nf-core/software/bwamem2_index.nf
index 35fe9ac79e..d479fef7ab 100644
--- a/modules/nf-core/software/bwamem2_index.nf
+++ b/modules/nf-core/software/bwamem2_index.nf
@@ -11,7 +11,7 @@ process BWAMEM2_INDEX {
     tag "${fasta}"

     publishDir params.outdir, mode: params.publish_dir_mode,
-        saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:'') }
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) }

     conda environment

     container container
diff --git a/modules/nf-core/software/fastqc.nf b/modules/nf-core/software/fastqc.nf
index 6b0ac4f0dc..88985cb75c 100644
--- a/modules/nf-core/software/fastqc.nf
+++ b/modules/nf-core/software/fastqc.nf
@@ -12,12 +12,8 @@ process FASTQC {
     label 'process_medium'
     label 'cpus_2'

-    publishDir "${params.outdir}/${options.publish_dir}/${meta.sample}/${meta.id}",
-        mode: params.publish_dir_mode,
-        saveAs: { filename ->
-            if (options.publish_results == "none") null
-            else if (filename.endsWith('.version.txt')) null
-            else filename }
+    publishDir params.outdir, mode: params.publish_dir_mode,
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) }

     conda environment

     container container
diff --git a/modules/nf-core/software/gatk/applybqsr.nf b/modules/nf-core/software/gatk/applybqsr.nf
index 0861b22028..3e6304c6a9 100644
--- a/modules/nf-core/software/gatk/applybqsr.nf
+++ b/modules/nf-core/software/gatk/applybqsr.nf
@@ -13,6 +13,9 @@ process GATK_APPLYBQSR {

     tag "${meta.id}-${interval.baseName}"

+    publishDir params.outdir, mode: params.publish_dir_mode,
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) }
+
     conda environment

     container container
diff --git a/modules/nf-core/software/gatk/baserecalibrator.nf b/modules/nf-core/software/gatk/baserecalibrator.nf
index b02ea77b41..48808fdbe5 100644
--- a/modules/nf-core/software/gatk/baserecalibrator.nf
+++ b/modules/nf-core/software/gatk/baserecalibrator.nf
@@ -12,6 +12,9 @@ process GATK_BASERECALIBRATOR {

     tag "${meta.id}-${interval.baseName}"

+    publishDir params.outdir, mode: params.publish_dir_mode,
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) }
+
     conda environment

     container container
diff --git a/modules/nf-core/software/gatk/createsequencedictionary.nf b/modules/nf-core/software/gatk/createsequencedictionary.nf
index 0129e33b26..04055858e2 100644
--- a/modules/nf-core/software/gatk/createsequencedictionary.nf
+++ b/modules/nf-core/software/gatk/createsequencedictionary.nf
@@ -11,7 +11,7 @@ process GATK_CREATESEQUENCEDICTIONARY {
     tag "${fasta}"

     publishDir params.outdir, mode: params.publish_dir_mode,
-        saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:'') }
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) }

     conda environment

     container container
diff --git a/modules/nf-core/software/gatk/gatherbqsrreports.nf b/modules/nf-core/software/gatk/gatherbqsrreports.nf
index c9686b0a8b..597faf5d50 100644
--- a/modules/nf-core/software/gatk/gatherbqsrreports.nf
+++ b/modules/nf-core/software/gatk/gatherbqsrreports.nf
@@ -13,10 +13,7 @@ process GATK_GATHERBQSRREPORTS {
     tag "${meta.id}"

     publishDir params.outdir, mode: params.publish_dir_mode,
-        saveAs: {
-            if (it == "${meta.sample}.recal.table" && !params.skip_markduplicates) "preprocessing/${meta.sample}/markduplicates/${it}"
-            else "preprocessing/${meta.sample}/mapped/${it}"
-        }
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) }

     conda environment

     container container
diff --git a/modules/nf-core/software/gatk/genotypegvcf.nf b/modules/nf-core/software/gatk/genotypegvcf.nf
index 32f3fd5485..9c6f1cdb29 100644
--- a/modules/nf-core/software/gatk/genotypegvcf.nf
+++ b/modules/nf-core/software/gatk/genotypegvcf.nf
@@ -10,6 +10,9 @@ if (workflow.containerEngine == 'singularity' && !params.pull_docker_container)
 process GATK_GENOTYPEGVCF {
     tag "${meta.id}-${interval.baseName}"

+    publishDir params.outdir, mode: params.publish_dir_mode,
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) }
+
     conda environment

     container container
diff --git a/modules/nf-core/software/gatk/haplotypecaller.nf b/modules/nf-core/software/gatk/haplotypecaller.nf
index e0dd960803..025d29026f 100644
--- a/modules/nf-core/software/gatk/haplotypecaller.nf
+++ b/modules/nf-core/software/gatk/haplotypecaller.nf
@@ -13,6 +13,9 @@ process GATK_HAPLOTYPECALLER {

     tag "${meta.id}-${interval.baseName}"

+    publishDir params.outdir, mode: params.publish_dir_mode,
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) }
+
     conda environment

     container container
diff --git a/modules/nf-core/software/gatk/markduplicates.nf b/modules/nf-core/software/gatk/markduplicates.nf
index b57ebcc44c..9c2acfae7e 100644
--- a/modules/nf-core/software/gatk/markduplicates.nf
+++ b/modules/nf-core/software/gatk/markduplicates.nf
@@ -12,10 +12,7 @@ process GATK_MARKDUPLICATES {
     tag "${meta.id}"

     publishDir params.outdir, mode: params.publish_dir_mode,
-        saveAs: {
-            if (it == "${meta.sample}.bam.metrics") "Reports/${meta.sample}/markduplicates/${it}"
-            else "preprocessing/${meta.sample}/markduplicates/${it}"
-        }
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) }

     conda environment

     container container
diff --git a/modules/nf-core/software/htslib_tabix.nf b/modules/nf-core/software/htslib_tabix.nf
index e1b514f248..25f1c5e873 100644
--- a/modules/nf-core/software/htslib_tabix.nf
+++ b/modules/nf-core/software/htslib_tabix.nf
@@ -12,7 +12,7 @@ process HTSLIB_TABIX {
     tag "${vcf}"

     publishDir params.outdir, mode: params.publish_dir_mode,
-        saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:'') }
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) }

     conda environment

     container container
diff --git a/modules/nf-core/software/qualimap_bamqc.nf b/modules/nf-core/software/qualimap_bamqc.nf
index 3f5070e605..25d3715d11 100644
--- a/modules/nf-core/software/qualimap_bamqc.nf
+++ b/modules/nf-core/software/qualimap_bamqc.nf
@@ -13,7 +13,8 @@ process QUALIMAP_BAMQC {

     tag "${meta.id}"

-    publishDir "${params.outdir}/Reports/${meta.id}/bamQC", mode: params.publish_dir_mode
+    publishDir params.outdir, mode: params.publish_dir_mode,
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) }

     conda environment

     container container
diff --git a/modules/nf-core/software/samtools/faidx.nf b/modules/nf-core/software/samtools/faidx.nf
index ab08b37c2f..cecf74aa0f 100644
--- a/modules/nf-core/software/samtools/faidx.nf
+++ b/modules/nf-core/software/samtools/faidx.nf
@@ -11,7 +11,7 @@ process SAMTOOLS_FAIDX {
     tag "${fasta}"

     publishDir params.outdir, mode: params.publish_dir_mode,
-        saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:'') }
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) }

     conda environment

     container container
diff --git a/modules/nf-core/software/samtools/index.nf b/modules/nf-core/software/samtools/index.nf
index ffb3564b35..0e5c3f11f7 100644
--- a/modules/nf-core/software/samtools/index.nf
+++ b/modules/nf-core/software/samtools/index.nf
@@ -13,10 +13,7 @@ process SAMTOOLS_INDEX {
     tag "${meta.id}"

     publishDir params.outdir, mode: params.publish_dir_mode,
-        saveAs: { filename ->
-            if (options.publish_results == "none") null
-            else if (filename.endsWith('.version.txt')) null
-            else "${options.publish_dir_up}/${meta.sample}/${options.publish_dir_down}/${filename}" }
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) }

     conda environment

     container container
diff --git a/modules/nf-core/software/samtools/stats.nf b/modules/nf-core/software/samtools/stats.nf
index aeaa06931d..6302366db5 100644
--- a/modules/nf-core/software/samtools/stats.nf
+++ b/modules/nf-core/software/samtools/stats.nf
@@ -12,7 +12,8 @@ process SAMTOOLS_STATS {

     tag "${meta.id}"

-    publishDir "${params.outdir}/Reports/${meta.id}/SamToolsStats", mode: params.publish_dir_mode
+    publishDir params.outdir, mode: params.publish_dir_mode,
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) }

     conda environment

     container container
diff --git a/modules/nf-core/software/strelka/germline.nf b/modules/nf-core/software/strelka/germline.nf
index 5f6d024160..7a97bf2bd6 100644
--- a/modules/nf-core/software/strelka/germline.nf
+++ b/modules/nf-core/software/strelka/germline.nf
@@ -13,9 +13,8 @@ process STRELKA_GERMLINE {
     label 'CPUS_MAX'
     label 'MEMORY_MAX'

-    publishDir "${params.outdir}",
-        mode: params.publish_dir_mode,
-        saveAs: { filename -> saveFiles(filename:filename, options:options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) }
+    publishDir params.outdir, mode: params.publish_dir_mode,
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) }

     conda environment

     container container
diff --git a/modules/nf-core/software/trimgalore.nf b/modules/nf-core/software/trimgalore.nf
index 7a233bd14c..83f5c6eeb0 100644
--- a/modules/nf-core/software/trimgalore.nf
+++ b/modules/nf-core/software/trimgalore.nf
@@ -11,12 +11,8 @@ process TRIMGALORE {
     tag "${meta.id}"
     label 'process_high'

-    publishDir "${params.outdir}/${options.publish_dir}",
-        mode: params.publish_dir_mode,
-        saveAs: { filename ->
-            if (options.publish_results == "none") null
-            else if (filename.endsWith('.version.txt')) null
-            else filename }
+    publishDir params.outdir, mode: params.publish_dir_mode,
+        saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) }

     conda environment

     container container

From 03bf7712c79ba9119ec489c850571df6c8d911c3 Mon Sep 17 00:00:00 2001
From: MaxUlysse
Date: Tue, 20 Oct 2020 15:07:29 +0200
Subject: [PATCH 195/200] chores: update modules with new syntax

---
 conf/modules.config                                       | 257 ++++--------------
 main.nf                                                   |  72 ++---
 modules/local/process/build_intervals.nf                  |   4 +-
 modules/local/process/bwa_mem.nf                          |   4 +-
 modules/local/process/bwamem2_mem.nf                      |   4 +-
 modules/local/process/concat_vcf.nf                       |   2 +-
 modules/local/process/create_intervals_bed.nf             |   2 +-
 modules/local/subworkflow/build_indices.nf                |  22 +-
 .../subworkflow/germline_variant_calling.nf               |  10 +-
 modules/nf-core/software/bwa/index/main.nf                |   6 +-
 modules/nf-core/software/bwamem2_index.nf                 |   2 +-
 modules/nf-core/software/fastqc.nf                        |   4 +-
 modules/nf-core/software/functions.nf                     |   8 +-
 modules/nf-core/software/gatk/applybqsr.nf                |   2 +-
 .../nf-core/software/gatk/baserecalibrator.nf             |   2 +-
 .../software/gatk/createsequencedictionary.nf             |   2 +-
 .../software/gatk/gatherbqsrreports.nf                    |   1 +
 modules/nf-core/software/gatk/genotypegvcf.nf             |   2 +-
 .../nf-core/software/gatk/haplotypecaller.nf              |   2 +-
 .../nf-core/software/gatk/markduplicates.nf               |   1 +
 modules/nf-core/software/htslib_tabix.nf                  |   2 +-
 modules/nf-core/software/samtools/faidx.nf                |   2 +-
 modules/nf-core/software/trimgalore.nf                    |   3 +-
 23 files changed, 118 insertions(+), 298 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index 6664a5deff..617ed9e868 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -8,303 +8,148 @@ params {
     modules {
     // BUILD_INDICES
         'build_intervals' {
-            args = ""
-            args2 = ""
-            extra = ""
-            suffix = ""
-            publish_by_id = "false"
-            publish_dir = "reference/bwa"
-            publish_results = "all"
+            publish_dir = "reference"
+            publish_files = "false"
         }
         'bwa_index' {
-            args = ""
-            args2 = ""
-            extra = ""
-            suffix = ""
-            publish_by_id = "false"
-            publish_dir = "reference/bwa"
-            publish_results = "all"
+            publish_dir = "reference"
+            publish_files = "false"
         }
         'bwamem2_index' {
-            args = ""
-            args2 = ""
-            extra = ""
-            suffix = ""
-            publish_by_id = "false"
-            publish_dir = "reference/bwamem2"
-            publish_results = "all"
+            publish_dir = "reference"
+            publish_files = "false"
        }
         'create_intervals_bed' {
-            args = ""
-            args2 = ""
-            extra = ""
-            suffix = ""
-            publish_by_id = "false"
             publish_dir = "reference"
-            publish_results = "all"
+            publish_files = "false"
+        }
+        'dict' {
             publish_dir = "reference"
-            publish_results = "all"
+            publish_files = "false"
         }
         'samtools_faidx' {
-            args = ""
-            args2 = ""
-            extra = ""
-            suffix = ""
-            publish_by_id = "false"
             publish_dir = "reference"
-            publish_results = "none"
+            publish_files = "false"
         }
         'tabix_dbsnp' {
-            args = ""
-            args2 = ""
-            extra = ""
-            suffix = ""
-            publish_by_id = "false"
             publish_dir = "reference"
-            publish_results = "all"
+            publish_files = "false"
         }
         'tabix_germline_resource' {
-            args = ""
-            args2 = ""
-            extra = ""
-            suffix = ""
-            publish_by_id = "false"
             publish_dir = "reference"
-            publish_results = "all"
+            publish_files = "false"
         }
         'tabix_known_indels' {
-            args = ""
-            args2 = ""
-            extra = ""
-            suffix = ""
-            publish_by_id = "false"
             publish_dir = "reference"
-            publish_results = "all"
+            publish_files = "false"
         }
         'tabix_pon' {
-            args = ""
-            args2 = ""
-            extra = ""
-            suffix = ""
-            publish_by_id = "false"
             publish_dir = "reference"
-            publish_results = "all"
+            publish_files = "false"
         }
     // MAPPING
         'bwa_mem1_mem' {
             args = "-K 100000000 -M"
-            args2 = ""
-            extra = ""
-            suffix = ""
-            publish_by_id = "false"
-            publish_dir = ""
-            publish_results = "none"
+            publish_files = "false"
         }
         'bwa_mem2_mem' {
             args = "-K 100000000 -M"
-            args2 = ""
-            extra = ""
-            suffix = ""
-            publish_by_id = "false"
-            publish_dir = ""
-            publish_results = "none"
+            publish_files = "false"
         }
         'merge_bam_mapping' {
-            args = ""
-            args2 = ""
-            extra = ""
-            suffix = ""
-            publish_by_id = "false"
+            publish_by_id = "true"
             publish_dir = ""
-            publish_results = "none"
         }
         'qualimap_bamqc_mapping' {
-            args = ""
-            args2 = ""
-            extra = ""
-            suffix = ""
-            publish_by_id = "false"
-            publish_dir = "qualimap"
-            publish_results = "all"
+            publish_by_id = "true"
+            publish_dir = "reports/qualimap"
         }
         'samtools_index_mapping' {
-            args = ""
-            args2 = ""
-            extra = ""
-            suffix = ""
-            publish_by_id = "false"
-            publish_dir = ""
-            publish_dir_up = "preprocessing"
-            publish_dir_down = "mapped"
-            publish_results = "none"
+            publish_by_id = "true"
+            publish_dir = "mapped"
         }
         'samtools_stats_mapping' {
-            args = ""
-            args2 = ""
-            extra = ""
-            suffix = ""
-            publish_by_id = "false"
-            publish_dir = "samtools_stats"
-            publish_results = "none"
+            publish_by_id = "true"
+            publish_dir = "reports/samtools_stats"
         }
     // MARKDUPLICATES
         'markduplicates' {
             args = "ASSUME_SORTED=true REMOVE_DUPLICATES=false VALIDATION_STRINGENCY=LENIENT TMP_DIR=tmp"
-            args2 = ""
-            extra = ""
             suffix = ".md"
-            publish_by_id = "false"
-            publish_dir = ""
-            publish_results = "all"
+            publish_by_id = "true"
+            publish_dir = "markduplicates"
         }
     // PREPARE_RECALIBRATION
         'baserecalibrator' {
-            args = ""
-            args2 = ""
-            extra = ""
-            suffix = ""
-            publish_by_id = "false"
-            publish_dir = "haplotypecaller"
-            publish_results = "all"
+            publish_by_id = "true"
+            publish_dir = "baserecalibrator"
         }
         'gatherbqsrreports' {
-            args = ""
-            args2 = ""
-            extra = ""
-            suffix = ""
-            publish_by_id = "false"
-            publish_dir = "haplotypecaller"
-            publish_results = "all"
+            publish_by_id = "true"
+            publish_dir = "gatherbqsrreports"
         }
     // RECALIBRATE
         'applybqsr' {
-            args = ""
-            args2 = ""
-            extra = ""
-            suffix = "md"
-            publish_by_id = "false"
+            suffix = ".recal"
+            publish_by_id = "true"
             publish_dir = ""
-            publish_results = "none"
         }
         'merge_bam_recalibrate' {
-            args = ""
-            args2 = ""
-            extra = ""
-            suffix = "md"
-            publish_by_id = "false"
+            suffix = ".recal"
+            publish_by_id = "true"
             publish_dir = ""
-            publish_results = "none"
         }
         'qualimap_bamqc_recalibrate' {
-            args = ""
-            args2 = ""
-            extra = ""
-            suffix = ""
-            publish_by_id = "false"
-            publish_dir = "qualimap"
-            publish_results = "all"
+            publish_by_id = "true"
+            publish_dir = "reports/qualimap"
         }
         'samtools_index_recalibrate' {
-            args = ""
-            args2 = ""
-            extra = ""
-            suffix = "md"
-            publish_by_id = "false"
-            publish_dir = ""
-            publish_dir_up = "preprocessing"
-            publish_dir_down = "recalibrated"
-            publish_results = "all"
+            suffix = ".recal"
+            publish_by_id = "true"
+            publish_dir = "recalibrated"
         }
         'samtools_stats_recalibrate' {
-            args = ""
-            args2 = ""
-            extra = ""
-            suffix = ""
-            publish_by_id = "false"
-            publish_dir = "samtools_stats"
-            publish_results = "none"
+            publish_by_id = "true"
+            publish_dir =
"reports/samtools_stats" } // GERMLINE_VARIANT_CALLING 'haplotypecaller' { - args = "" - args2 = "" - extra = "" - suffix = "" - publish_by_id = "false" + publish_by_id = "true" publish_dir = "haplotypecaller" - publish_results = "none" + publish_files = "false" } 'genotypegvcf' { - args = "" - args2 = "" - extra = "" - suffix = "" - publish_by_id = "false" + publish_by_id = "true" publish_dir = "haplotypecallergvcf" - publish_results = "none" + publish_files = "false" } 'concat_haplotypecaller' { - args = "" - args2 = "" - extra = "" - suffix = "" - publish_by_id = "false" + publish_by_id = "true" publish_dir = "haplotypecaller" - publish_results = "all" } 'concat_gvcf' { - args = "" - args2 = "" - extra = "" suffix = ".g" - publish_by_id = "false" + publish_by_id = "true" publish_dir = "haplotypecallergvcf" - publish_results = "all" } 'strelka_germline' { - args = "" - args2 = "" - extra = "" - suffix = "" - publish_by_id = "false" + publish_by_id = "true" publish_dir = "strelka" - publish_results = "all" } // QC_TRIM 'fastqc' { args = "--quiet" - args = "" - extra = "" - suffix = "" - publish_by_id = "false" - publish_dir = "Reports/FastQC" - publish_results = "all" + publish_by_id = "true" + publish_dir = "reports/fastqc" } 'trimgalore' { args = "--fastqc" - args2 = "" - extra = "" - suffix = "" - publish_by_id = "false" + publish_by_id = "true" publish_dir = "trimgalore" - publish_results = "all" } // OTHERS 'get_software_versions' { - args = "" - args2 = "" - extra = "" - suffix = "" - publish_by_id = "false" publish_dir = "pipeline_info" - publish_results = "all" } } } diff --git a/main.nf b/main.nf index 305edfe6ca..215400c9c2 100644 --- a/main.nf +++ b/main.nf @@ -105,6 +105,7 @@ if (params.input && (has_extension(params.input, "tsv") || has_extension(params. if (params.input && (has_extension(params.input, "vcf") || has_extension(params.input, "vcf.gz"))) step = "annotate" save_bam_mapped = params.skip_markduplicates ? true : params.save_bam_mapped ? 
true : false +save_reference = params.save_reference // If no input file specified, trying to get TSV files corresponding to step in the TSV directory // only for steps preparerecalibration, recalibrate, variantcalling and controlfreec @@ -163,7 +164,25 @@ if (tsv_path) { log.info "Trying automatic annotation on files in the VariantCalling/ directory" } else exit 1, 'No sample were defined, see --help' -// input_sample.dump(tag: 'input sample') +/* +================================================================================ + UPDATE MODULES OPTIONS BASED ON PARAMS +================================================================================ +*/ + +modules = params.modules + +if (save_reference) modules['build_intervals'].publish_files = ['bed':'intervals'] +if (save_reference) modules['bwa_index'].publish_files = ['amb':'bwa', 'ann':'bwa', 'bwt':'bwa', 'pac':'bwa', 'sa':'bwa'] +if (save_reference) modules['bwamem2_index'].publish_files = ['0123':'bwamem2', 'amb':'bwamem2', 'ann':'bwamem2', 'bwt.2bit.64':'bwamem2', 'bwt.8bit.32':'bwamem2', 'pac':'bwamem2'] +if (save_reference) modules['create_intervals_bed'].publish_files = ['bed':'intervals'] +if (save_reference) modules['dict'].publish_files = ['dict':'dict'] +if (save_reference) modules['samtools_faidx'].publish_files = ['fai':'fai'] +if (save_reference) modules['tabix_dbsnp'].publish_files = ['vcf.gz.tbi':'dbsnp'] +if (save_reference) modules['tabix_germline_resource'].publish_files = ['vcf.gz.tbi':'germline_resource'] +if (save_reference) modules['tabix_known_indels'].publish_files = ['vcf.gz.tbi':'known_indels'] +if (save_reference) modules['tabix_pon'].publish_files = ['vcf.gz.tbi':'pon'] +if (save_bam_mapped) modules['samtools_index_mapping'].publish_files = ['bam':'mapped', 'bai':'mapped'] /* ================================================================================ @@ -171,9 +190,6 @@ if (tsv_path) { ================================================================================ */ -modules = params.modules.clone() -if (save_bam_mapped) modules['samtools_index_mapping'].publish_results = "all" - // Initialize each params in params.genomes, catch the command line first if it was defined params.ac_loci = params.genome ? params.genomes[params.genome].ac_loci ?: false : false params.ac_loci_gc = params.genome ? 
params.genomes[params.genome].ac_loci_gc ?: false : false @@ -268,7 +284,7 @@ include { BUILD_INDICES } from './modules/local/subworkflow/build_indices' addPa bwa_index_options: modules['bwa_index'], bwamem2_index_options: modules['bwamem2_index'], create_intervals_bed_options: modules['create_intervals_bed'], - gatk_dict_options: modules['gatk_dict'], + gatk_dict_options: modules['dict'], samtools_faidx_options: modules['samtools_faidx'], tabix_dbsnp_options: modules['tabix_dbsnp'], tabix_germline_resource_options: modules['tabix_germline_resource'], @@ -2100,48 +2116,4 @@ workflow.onComplete { // """ // } -// compressVCFOutVEP = compressVCFOutVEP.dump(tag:'VCF') - -// /* -// ================================================================================ -// MultiQC -// ================================================================================ -// */ - -// // STEP MULTIQC - -// process MultiQC { -// publishDir "${params.outdir}/Reports/MultiQC", mode: params.publish_dir_mode - -// input: -// file (multiqcConfig) from multiqc_config -// file (mqc_custom_config) from multiqc_custom_config.collect().ifEmpty([]) -// file (versions) from ch_software_versions_yaml.collect() -// file workflow_summary from workflow_summary.collectFile(name: "workflow_summary_mqc.yaml") -// file ('bamQC/*') from bamQCReport.collect().ifEmpty([]) -// file ('BCFToolsStats/*') from bcftoolsReport.collect().ifEmpty([]) -// file ('FastQC/*') from fastQCReport.collect().ifEmpty([]) -// file ('TrimmedFastQC/*') from trimGaloreReport.collect().ifEmpty([]) -// file ('MarkDuplicates/*') from duplicates_marked_report.collect().ifEmpty([]) -// file ('DuplicatesMarked/*.recal.table') from baseRecalibratorReport.collect().ifEmpty([]) -// file ('SamToolsStats/*') from samtoolsStatsReport.collect().ifEmpty([]) -// file ('snpEff/*') from snpeffReport.collect().ifEmpty([]) -// file ('VCFTools/*') from vcftoolsReport.collect().ifEmpty([]) - -// output: -// file "*multiqc_report.html" into ch_multiqc_report -// file "*_data" -// file "multiqc_plots" - -// when: !('multiqc' in skip_qc) - -// script: -// rtitle = custom_runName ? "--title \"$custom_runName\"" : '' -// rfilename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : '' -// custom_config_file = params.multiqc_config ? "--config $mqc_custom_config" : '' -// """ -// multiqc -f ${rtitle} ${rfilename} ${custom_config_file} . 
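The `modules[...].publish_files` overrides added to main.nf above plug into the `saveFiles` branch patched in modules/nf-core/software/functions.nf later in this patch: a Map value publishes only matching files, each under its mapped subfolder; null publishes everything; any other value (the `"false"` strings in modules.config) publishes nothing. A self-contained re-implementation of just that routing branch, with an illustrative BWA index file:

// Simplified re-implementation of the publish_files routing in saveFiles.
def route(String filename, String publishDir, def publishFiles) {
    if (publishFiles instanceof Map) {
        for (entry in publishFiles) {
            if (filename.endsWith(entry.key)) return "${publishDir}/${entry.value}/${filename}"
        }
        return null                       // Map given but nothing matched: not published
    }
    return publishFiles == null ? "${publishDir}/${filename}" : null
}

def bwa = ['amb':'bwa', 'ann':'bwa', 'bwt':'bwa', 'pac':'bwa', 'sa':'bwa']
assert route('genome.fasta.amb', 'reference', bwa)     == 'reference/bwa/genome.fasta.amb'
assert route('genome.fasta',     'reference', bwa)     == null      // unmatched: not published
assert route('genome.fasta',     'reference', "false") == null      // "false": publish nothing
assert route('genome.fasta',     'reference', null)    == 'reference/genome.fasta'

This is why `--save_reference` can be implemented purely as config surgery: flipping `publish_files` from `"false"` to a Map turns publishing back on without touching any process.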
-// """ -// } - -// ch_multiqc_report.dump(tag:'MultiQC') +// compressVCFOutVEP = compressVCFOutVEP.dump(tag:'VCF') \ No newline at end of file diff --git a/modules/local/process/build_intervals.nf b/modules/local/process/build_intervals.nf index 697e0ec3d9..861c3c07b8 100644 --- a/modules/local/process/build_intervals.nf +++ b/modules/local/process/build_intervals.nf @@ -9,10 +9,10 @@ container = "quay.io/biocontainers/gawk:5.1.0" if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/gawk:5.1.0" process BUILD_INTERVALS { - tag fai + tag "${fai}" publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:"false") } conda environment container container diff --git a/modules/local/process/bwa_mem.nf b/modules/local/process/bwa_mem.nf index 305490de21..bcbd43d9ba 100644 --- a/modules/local/process/bwa_mem.nf +++ b/modules/local/process/bwa_mem.nf @@ -9,10 +9,10 @@ container = "quay.io/biocontainers/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:eabfac3657eda5818bae4090db989e3d41b01542-0" process BWA_MEM { - tag "${meta.id}" - label 'process_high' + tag "${meta.id}" + publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } diff --git a/modules/local/process/bwamem2_mem.nf b/modules/local/process/bwamem2_mem.nf index a25836f36f..f0117b5807 100644 --- a/modules/local/process/bwamem2_mem.nf +++ b/modules/local/process/bwamem2_mem.nf @@ -8,10 +8,10 @@ container = "quay.io/biocontainers/mulled-v2-e5d375990341c5aef3c9aff74f96f66f653 if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/mulled-v2-e5d375990341c5aef3c9aff74f96f66f65375ef6:876eb6f1d38fbf578296ea94e5aede4e317939e7-0" process BWAMEM2_MEM { - tag "${meta.id}" - label 'process_high' + tag "${meta.id}" + publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } diff --git a/modules/local/process/concat_vcf.nf b/modules/local/process/concat_vcf.nf index 35a3af5540..62939cad74 100644 --- a/modules/local/process/concat_vcf.nf +++ b/modules/local/process/concat_vcf.nf @@ -10,7 +10,7 @@ if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) process CONCAT_VCF { label 'cpus_8' - tag "${options.publish_dir}-${meta.id}" + tag "${meta.id}" publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } diff --git a/modules/local/process/create_intervals_bed.nf b/modules/local/process/create_intervals_bed.nf index d90a0a900a..6c6eecac95 100644 --- a/modules/local/process/create_intervals_bed.nf +++ b/modules/local/process/create_intervals_bed.nf @@ -12,7 +12,7 @@ process CREATE_INTERVALS_BED { tag 
"${intervals}" publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:"false") } conda environment container container diff --git a/modules/local/subworkflow/build_indices.nf b/modules/local/subworkflow/build_indices.nf index 58c057977a..0f4eaba448 100644 --- a/modules/local/subworkflow/build_indices.nf +++ b/modules/local/subworkflow/build_indices.nf @@ -17,16 +17,16 @@ params.tabix_pon_options = [:] // Initialize channels based on params or indices that were just built -include { BUILD_INTERVALS } from '../process/build_intervals.nf' addParams(option: params.build_intervals_options) -include { BWA_INDEX } from '../../nf-core/software/bwa/index/main.nf' addParams(option: params.bwa_index_options) -include { BWAMEM2_INDEX } from '../../nf-core/software/bwamem2_index.nf' addParams(option: params.bwamem2_index_options) -include { CREATE_INTERVALS_BED } from '../process/create_intervals_bed.nf' addParams(option: params.create_intervals_bed_options) -include { GATK_CREATESEQUENCEDICTIONARY as GATK_DICT } from '../../nf-core/software/gatk/createsequencedictionary.nf' addParams(option: params.gatk_dict_options) -include { HTSLIB_TABIX as TABIX_DBSNP } from '../../nf-core/software/htslib_tabix' addParams(option: params.tabix_dbsnp_options) -include { HTSLIB_TABIX as TABIX_GERMLINE_RESOURCE } from '../../nf-core/software/htslib_tabix' addParams(option: params.tabix_germline_resource_options) -include { HTSLIB_TABIX as TABIX_KNOWN_INDELS } from '../../nf-core/software/htslib_tabix' addParams(option: params.tabix_known_indels_options) -include { HTSLIB_TABIX as TABIX_PON } from '../../nf-core/software/htslib_tabix' addParams(option: params.tabix_pon_options) -include { SAMTOOLS_FAIDX } from '../../nf-core/software/samtools/faidx.nf' addParams(option: params.samtools_faidx_options) +include { BUILD_INTERVALS } from '../process/build_intervals.nf' addParams(options: params.build_intervals_options) +include { BWA_INDEX as BWAMEM1_INDEX } from '../../nf-core/software/bwa/index/main.nf' addParams(options: params.bwa_index_options) +include { BWAMEM2_INDEX } from '../../nf-core/software/bwamem2_index.nf' addParams(options: params.bwamem2_index_options) +include { CREATE_INTERVALS_BED } from '../process/create_intervals_bed.nf' addParams(options: params.create_intervals_bed_options) +include { GATK_CREATESEQUENCEDICTIONARY as GATK_DICT } from '../../nf-core/software/gatk/createsequencedictionary.nf' addParams(options: params.gatk_dict_options) +include { HTSLIB_TABIX as TABIX_DBSNP } from '../../nf-core/software/htslib_tabix' addParams(options: params.tabix_dbsnp_options) +include { HTSLIB_TABIX as TABIX_GERMLINE_RESOURCE } from '../../nf-core/software/htslib_tabix' addParams(options: params.tabix_germline_resource_options) +include { HTSLIB_TABIX as TABIX_KNOWN_INDELS } from '../../nf-core/software/htslib_tabix' addParams(options: params.tabix_known_indels_options) +include { HTSLIB_TABIX as TABIX_PON } from '../../nf-core/software/htslib_tabix' addParams(options: params.tabix_pon_options) +include { SAMTOOLS_FAIDX } from '../../nf-core/software/samtools/faidx.nf' addParams(options: params.samtools_faidx_options) workflow BUILD_INDICES{ take: @@ -43,7 +43,7 @@ workflow BUILD_INDICES{ result_bwa = Channel.empty() version_bwa = 
Channel.empty() if (!(params.bwa) && 'mapping' in step) - if (params.aligner == "bwa-mem") (result_bwa, version_bwa) = BWA_INDEX(fasta) + if (params.aligner == "bwa-mem") (result_bwa, version_bwa) = BWAMEM1_INDEX(fasta) else result_bwa = BWAMEM2_INDEX(fasta) result_dict = Channel.empty() diff --git a/modules/local/subworkflow/germline_variant_calling.nf b/modules/local/subworkflow/germline_variant_calling.nf index 8fcd21b50e..1964b12a61 100644 --- a/modules/local/subworkflow/germline_variant_calling.nf +++ b/modules/local/subworkflow/germline_variant_calling.nf @@ -10,11 +10,11 @@ params.concat_gvcf_options = [:] params.concat_haplotypecaller_options = [:] params.strelka_options = [:] -include { GATK_HAPLOTYPECALLER as HAPLOTYPECALLER } from '../../nf-core/software/gatk/haplotypecaller' addParams(option: params.haplotypecaller_options) -include { GATK_GENOTYPEGVCF as GENOTYPEGVCF } from '../../nf-core/software/gatk/genotypegvcf' addParams(option: params.genotypegvcf_options) -include { CONCAT_VCF as CONCAT_GVCF } from '../process/concat_vcf' addParams(option: params.concat_gvcf_options) -include { CONCAT_VCF as CONCAT_HAPLOTYPECALLER } from '../process/concat_vcf' addParams(option: params.concat_haplotypecaller_options) -include { STRELKA_GERMLINE as STRELKA } from '../../nf-core/software/strelka/germline' addParams(option: params.strelka_options) +include { GATK_HAPLOTYPECALLER as HAPLOTYPECALLER } from '../../nf-core/software/gatk/haplotypecaller' addParams(options: params.haplotypecaller_options) +include { GATK_GENOTYPEGVCF as GENOTYPEGVCF } from '../../nf-core/software/gatk/genotypegvcf' addParams(options: params.genotypegvcf_options) +include { CONCAT_VCF as CONCAT_GVCF } from '../process/concat_vcf' addParams(options: params.concat_gvcf_options) +include { CONCAT_VCF as CONCAT_HAPLOTYPECALLER } from '../process/concat_vcf' addParams(options: params.concat_haplotypecaller_options) +include { STRELKA_GERMLINE as STRELKA } from '../../nf-core/software/strelka/germline' addParams(options: params.strelka_options) workflow GERMLINE_VARIANT_CALLING { take: diff --git a/modules/nf-core/software/bwa/index/main.nf b/modules/nf-core/software/bwa/index/main.nf index 2dfd30d516..078cfb51d1 100644 --- a/modules/nf-core/software/bwa/index/main.nf +++ b/modules/nf-core/software/bwa/index/main.nf @@ -8,12 +8,12 @@ container = "quay.io/biocontainers/bwa:0.7.17--hed695b0_7" if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/bwa:0.7.17--hed695b0_7" process BWA_INDEX { - tag "${fasta}" - label 'process_high' + tag "${fasta}" + publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:"false") } conda environment container container diff --git a/modules/nf-core/software/bwamem2_index.nf b/modules/nf-core/software/bwamem2_index.nf index d479fef7ab..c8e26dfd7e 100644 --- a/modules/nf-core/software/bwamem2_index.nf +++ b/modules/nf-core/software/bwamem2_index.nf @@ -11,7 +11,7 @@ process BWAMEM2_INDEX { tag "${fasta}" publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + saveAs: { filename -> 
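Two things change in every include above: the alias (`BWA_INDEX as BWAMEM1_INDEX`, matching the `bwa_mem1_mem`/`bwa_mem2_mem` naming in modules.config) and the keyword passed to `addParams`. The keyword is the functional fix: each module declares `params.options = [:]`, and `addParams` overlays its map onto the included script's params, so only the key `options` reaches the `saveAs:` closures. A rough Groovy simulation of that overlay semantics (the real `addParams` is part of a Nextflow include clause, not a function you call yourself):

// Rough simulation of the addParams overlay.
def moduleParams = [options: [:]]            // the module's declared default
def addParams = { Map overlay -> moduleParams = moduleParams + overlay }

addParams(option: [publish_dir: 'reference'])   // old, typo'd keyword
assert moduleParams.options == [:]              // default untouched: options silently lost

addParams(options: [publish_dir: 'reference'])  // patched keyword
assert moduleParams.options.publish_dir == 'reference'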
saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:"false") } conda environment container container diff --git a/modules/nf-core/software/fastqc.nf b/modules/nf-core/software/fastqc.nf index 88985cb75c..80a5582a1f 100644 --- a/modules/nf-core/software/fastqc.nf +++ b/modules/nf-core/software/fastqc.nf @@ -1,3 +1,4 @@ +// Import generic module functions include { initOptions; saveFiles; getSoftwareName } from './functions' params.options = [:] @@ -8,10 +9,11 @@ container = "quay.io/biocontainers/fastqc:0.11.9--0" if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/fastqc:0.11.9--0" process FASTQC { - tag "${meta.id}" label 'process_medium' label 'cpus_2' + tag "${meta.id}" + publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } diff --git a/modules/nf-core/software/functions.nf b/modules/nf-core/software/functions.nf index d25eea86b3..ca46a99f5d 100644 --- a/modules/nf-core/software/functions.nf +++ b/modules/nf-core/software/functions.nf @@ -41,19 +41,17 @@ def saveFiles(Map args) { if (!args.filename.endsWith('.version.txt')) { def ioptions = initOptions(args.options) def path_list = [ ioptions.publish_dir ?: args.publish_dir ] - if (ioptions.publish_by_id) { - path_list.add(args.publish_id) - } + if (ioptions.publish_by_id) path_list.add(args.publish_id) if (ioptions.publish_files instanceof Map) { for (ext in ioptions.publish_files) { if (args.filename.endsWith(ext.key)) { def ext_list = path_list.collect() ext_list.add(ext.value) - return "${getPathFromList(ext_list)}/$args.filename" + return "${getPathFromList(ext_list)}/${args.filename}" } } } else if (ioptions.publish_files == null) { - return "${getPathFromList(path_list)}/$args.filename" + return "${getPathFromList(path_list)}/${args.filename}" } } } diff --git a/modules/nf-core/software/gatk/applybqsr.nf b/modules/nf-core/software/gatk/applybqsr.nf index 3e6304c6a9..d17bb10dda 100644 --- a/modules/nf-core/software/gatk/applybqsr.nf +++ b/modules/nf-core/software/gatk/applybqsr.nf @@ -11,7 +11,7 @@ process GATK_APPLYBQSR { label 'memory_singleCPU_2_task' label 'cpus_2' - tag "${meta.id}-${interval.baseName}" + tag "${meta.id}" publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } diff --git a/modules/nf-core/software/gatk/baserecalibrator.nf b/modules/nf-core/software/gatk/baserecalibrator.nf index 48808fdbe5..c6b8a35392 100644 --- a/modules/nf-core/software/gatk/baserecalibrator.nf +++ b/modules/nf-core/software/gatk/baserecalibrator.nf @@ -10,7 +10,7 @@ if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) process GATK_BASERECALIBRATOR { label 'cpus_1' - tag "${meta.id}-${interval.baseName}" + tag "${meta.id}" publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } diff --git a/modules/nf-core/software/gatk/createsequencedictionary.nf b/modules/nf-core/software/gatk/createsequencedictionary.nf index 04055858e2..a740b91a64 100644 --- a/modules/nf-core/software/gatk/createsequencedictionary.nf +++ 
b/modules/nf-core/software/gatk/createsequencedictionary.nf @@ -11,7 +11,7 @@ process GATK_CREATESEQUENCEDICTIONARY { tag "${fasta}" publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:"false") } conda environment container container diff --git a/modules/nf-core/software/gatk/gatherbqsrreports.nf b/modules/nf-core/software/gatk/gatherbqsrreports.nf index 597faf5d50..7bcebc6db4 100644 --- a/modules/nf-core/software/gatk/gatherbqsrreports.nf +++ b/modules/nf-core/software/gatk/gatherbqsrreports.nf @@ -10,6 +10,7 @@ if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) process GATK_GATHERBQSRREPORTS { label 'memory_singleCPU_2_task' label 'cpus_2' + tag "${meta.id}" publishDir params.outdir, mode: params.publish_dir_mode, diff --git a/modules/nf-core/software/gatk/genotypegvcf.nf b/modules/nf-core/software/gatk/genotypegvcf.nf index 9c6f1cdb29..07a009caa9 100644 --- a/modules/nf-core/software/gatk/genotypegvcf.nf +++ b/modules/nf-core/software/gatk/genotypegvcf.nf @@ -8,7 +8,7 @@ container = "quay.io/biocontainers/gatk4-spark:4.1.8.1--0" if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/gatk4-spark:4.1.8.1--0" process GATK_GENOTYPEGVCF { - tag "${meta.id}-${interval.baseName}" + tag "${meta.id}" publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } diff --git a/modules/nf-core/software/gatk/haplotypecaller.nf b/modules/nf-core/software/gatk/haplotypecaller.nf index 025d29026f..fdcc259abc 100644 --- a/modules/nf-core/software/gatk/haplotypecaller.nf +++ b/modules/nf-core/software/gatk/haplotypecaller.nf @@ -11,7 +11,7 @@ process GATK_HAPLOTYPECALLER { label 'MEMORY_SINGLECPU_TASK_SQ' label 'CPUS_2' - tag "${meta.id}-${interval.baseName}" + tag "${meta.id}" publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } diff --git a/modules/nf-core/software/gatk/markduplicates.nf b/modules/nf-core/software/gatk/markduplicates.nf index 9c2acfae7e..91bdfe76c8 100644 --- a/modules/nf-core/software/gatk/markduplicates.nf +++ b/modules/nf-core/software/gatk/markduplicates.nf @@ -9,6 +9,7 @@ if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) process GATK_MARKDUPLICATES { label 'cpus_16' + tag "${meta.id}" publishDir params.outdir, mode: params.publish_dir_mode, diff --git a/modules/nf-core/software/htslib_tabix.nf b/modules/nf-core/software/htslib_tabix.nf index 25f1c5e873..ce133a4b13 100644 --- a/modules/nf-core/software/htslib_tabix.nf +++ b/modules/nf-core/software/htslib_tabix.nf @@ -12,7 +12,7 @@ process HTSLIB_TABIX { tag "${vcf}" publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:"false") } conda environment container container diff 
--git a/modules/nf-core/software/samtools/faidx.nf b/modules/nf-core/software/samtools/faidx.nf index cecf74aa0f..c60a62518c 100644 --- a/modules/nf-core/software/samtools/faidx.nf +++ b/modules/nf-core/software/samtools/faidx.nf @@ -11,7 +11,7 @@ process SAMTOOLS_FAIDX { tag "${fasta}" publishDir params.outdir, mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:"false") } conda environment container container diff --git a/modules/nf-core/software/trimgalore.nf b/modules/nf-core/software/trimgalore.nf index 83f5c6eeb0..60369d80af 100644 --- a/modules/nf-core/software/trimgalore.nf +++ b/modules/nf-core/software/trimgalore.nf @@ -8,9 +8,10 @@ container = "quay.io/biocontainers/trim-galore:0.6.5--0" if (workflow.containerEngine == 'singularity' && !params.pull_docker_container) container = "https://depot.galaxyproject.org/singularity/trim-galore:0.6.5--0" process TRIMGALORE { - tag "${meta.id}" label 'process_high' + tag "${meta.id}" + publishDir params.outdir, mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:meta.id) } From 02b330d2c36c4db0644c325db7af50263ed63031 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 20 Oct 2020 15:24:20 +0200 Subject: [PATCH 196/200] fix: --step prepare_recalibration --skip_markduplicates --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 617ed9e868..4a0110ac81 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -66,7 +66,7 @@ params { } 'samtools_index_mapping' { publish_by_id = "true" - publish_dir = "mapped" + publish_dir = "preprocessing" } 'samtools_stats_mapping' { publish_by_id = "true" From c9e00f56145d2045a45898d6363ea2dbbc8871dd Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 20 Oct 2020 16:08:00 +0200 Subject: [PATCH 197/200] fix: --step recalibrate --skip_markduplicates --- conf/modules.config | 31 ++++++++++++++++--------------- main.nf | 25 +++++++++++++------------ 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 4a0110ac81..85772bb2fa 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -58,7 +58,7 @@ params { } 'merge_bam_mapping' { publish_by_id = "true" - publish_dir = "" + publish_dir = "preprocessing" } 'qualimap_bamqc_mapping' { publish_by_id = "true" @@ -77,36 +77,37 @@ params { args = "ASSUME_SORTED=true REMOVE_DUPLICATES=false VALIDATION_STRINGENCY=LENIENT TMP_DIR=tmp" suffix = ".md" publish_by_id = "true" - publish_dir = "markduplicates" + publish_dir = "preprocessing" } // PREPARE_RECALIBRATION 'baserecalibrator' { publish_by_id = "true" - publish_dir = "baserecalibrator" + publish_dir = "preprocessing" + publish_files = "false" } 'gatherbqsrreports' { publish_by_id = "true" - publish_dir = "gatherbqsrreports" + publish_dir = "preprocessing" } // RECALIBRATE 'applybqsr' { - suffix = ".recal" + suffix = "recal" publish_by_id = "true" - publish_dir = "" + publish_dir = "preprocessing" } 'merge_bam_recalibrate' { - suffix = ".recal" + suffix = "recal" publish_by_id = "true" - publish_dir = "" + publish_dir = "preprocessing" } 'qualimap_bamqc_recalibrate' { publish_by_id = "true" publish_dir = 
"reports/qualimap" } 'samtools_index_recalibrate' { - suffix = ".recal" + suffix = "recal" publish_by_id = "true" - publish_dir = "recalibrated" + publish_dir = "preprocessing" } 'samtools_stats_recalibrate' { publish_by_id = "true" @@ -115,26 +116,26 @@ params { // GERMLINE_VARIANT_CALLING 'haplotypecaller' { publish_by_id = "true" - publish_dir = "haplotypecaller" + publish_dir = "variant_calling" publish_files = "false" } 'genotypegvcf' { publish_by_id = "true" - publish_dir = "haplotypecallergvcf" + publish_dir = "variant_calling" publish_files = "false" } 'concat_haplotypecaller' { publish_by_id = "true" - publish_dir = "haplotypecaller" + publish_dir = "variant_calling" } 'concat_gvcf' { suffix = ".g" publish_by_id = "true" - publish_dir = "haplotypecallergvcf" + publish_dir = "variant_calling" } 'strelka_germline' { publish_by_id = "true" - publish_dir = "strelka" + publish_dir = "variant_calling" } // QC_TRIM 'fastqc' { diff --git a/main.nf b/main.nf index 215400c9c2..8a16d24046 100644 --- a/main.nf +++ b/main.nf @@ -105,7 +105,6 @@ if (params.input && (has_extension(params.input, "tsv") || has_extension(params. if (params.input && (has_extension(params.input, "vcf") || has_extension(params.input, "vcf.gz"))) step = "annotate" save_bam_mapped = params.skip_markduplicates ? true : params.save_bam_mapped ? true : false -save_reference = params.save_reference // If no input file specified, trying to get TSV files corresponding to step in the TSV directory // only for steps preparerecalibration, recalibrate, variantcalling and controlfreec @@ -172,17 +171,19 @@ if (tsv_path) { modules = params.modules -if (save_reference) modules['build_intervals'].publish_files = ['bed':'intervals'] -if (save_reference) modules['bwa_index'].publish_files = ['amb':'bwa', 'ann':'bwa', 'bwt':'bwa', 'pac':'bwa', 'sa':'bwa'] -if (save_reference) modules['bwamem2_index'].publish_files = ['0123':'bwamem2', 'amb':'bwamem2', 'ann':'bwamem2', 'bwt.2bit.64':'bwamem2', 'bwt.8bit.32':'bwamem2', 'pac':'bwamem2'] -if (save_reference) modules['create_intervals_bed'].publish_files = ['bed':'intervals'] -if (save_reference) modules['dict'].publish_files = ['dict':'dict'] -if (save_reference) modules['samtools_faidx'].publish_files = ['fai':'fai'] -if (save_reference) modules['tabix_dbsnp'].publish_files = ['vcf.gz.tbi':'dbsnp'] -if (save_reference) modules['tabix_germline_resource'].publish_files = ['vcf.gz.tbi':'germline_resource'] -if (save_reference) modules['tabix_known_indels'].publish_files = ['vcf.gz.tbi':'known_indels'] -if (save_reference) modules['tabix_pon'].publish_files = ['vcf.gz.tbi':'pon'] -if (save_bam_mapped) modules['samtools_index_mapping'].publish_files = ['bam':'mapped', 'bai':'mapped'] +if (params.save_reference) modules['build_intervals'].publish_files = ['bed':'intervals'] +if (params.save_reference) modules['bwa_index'].publish_files = ['amb':'bwa', 'ann':'bwa', 'bwt':'bwa', 'pac':'bwa', 'sa':'bwa'] +if (params.save_reference) modules['bwamem2_index'].publish_files = ['0123':'bwamem2', 'amb':'bwamem2', 'ann':'bwamem2', 'bwt.2bit.64':'bwamem2', 'bwt.8bit.32':'bwamem2', 'pac':'bwamem2'] +if (params.save_reference) modules['create_intervals_bed'].publish_files = ['bed':'intervals'] +if (params.save_reference) modules['dict'].publish_files = ['dict':'dict'] +if (params.save_reference) modules['samtools_faidx'].publish_files = ['fai':'fai'] +if (params.save_reference) modules['tabix_dbsnp'].publish_files = ['vcf.gz.tbi':'dbsnp'] +if (params.save_reference) 
modules['tabix_germline_resource'].publish_files = ['vcf.gz.tbi':'germline_resource'] +if (params.save_reference) modules['tabix_known_indels'].publish_files = ['vcf.gz.tbi':'known_indels'] +if (params.save_reference) modules['tabix_pon'].publish_files = ['vcf.gz.tbi':'pon'] +if (save_bam_mapped) modules['samtools_index_mapping'].publish_files = ['bam':'mapped', 'bai':'mapped'] +if (params.skip_markduplicates) modules['baserecalibrator'].publish_files = ['recal.table':'mapped'] +if (params.skip_markduplicates) modules['gatherbqsrreports'].publish_files = ['recal.table':'mapped'] /* ================================================================================ From da138f1555a47c8ca072ffccb6710fe27b8841cb Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 20 Oct 2020 16:31:56 +0200 Subject: [PATCH 198/200] fix: --step prepare_recalibration --- conf/modules.config | 1 + modules/local/subworkflow/markduplicates.nf | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 85772bb2fa..a6379483fd 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -78,6 +78,7 @@ params { suffix = ".md" publish_by_id = "true" publish_dir = "preprocessing" + publish_files = ['md.bam': 'markduplicates', 'md.bam.bai': 'markduplicates'] } // PREPARE_RECALIBRATION 'baserecalibrator' { diff --git a/modules/local/subworkflow/markduplicates.nf b/modules/local/subworkflow/markduplicates.nf index b0cf70b702..531e11b464 100644 --- a/modules/local/subworkflow/markduplicates.nf +++ b/modules/local/subworkflow/markduplicates.nf @@ -4,9 +4,9 @@ ================================================================================ */ -params.gatk_markduplicates_options = [:] +params.markduplicates_options = [:] -include { GATK_MARKDUPLICATES } from '../../nf-core/software/gatk/markduplicates' addParams(options: params.gatk_markduplicates_options) +include { GATK_MARKDUPLICATES } from '../../nf-core/software/gatk/markduplicates' addParams(options: params.markduplicates_options) workflow MARKDUPLICATES { take: From 937c6e138f393afbc30684b899d757d273f9fd37 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 20 Oct 2020 16:44:58 +0200 Subject: [PATCH 199/200] fix: --step recalibrate --- conf/modules.config | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index a6379483fd..74979b37ec 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -84,11 +84,12 @@ params { 'baserecalibrator' { publish_by_id = "true" publish_dir = "preprocessing" - publish_files = "false" + publish_files = ['recal.table':'markduplicates'] } 'gatherbqsrreports' { publish_by_id = "true" publish_dir = "preprocessing" + publish_files = ['recal.table':'markduplicates'] } // RECALIBRATE 'applybqsr' { From 59e0a03a32e52e022f4df7644a763256cb243078 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 20 Oct 2020 17:14:09 +0200 Subject: [PATCH 200/200] fix: --step variant_calling --- conf/modules.config | 7 ++++++- modules/local/process/concat_vcf.nf | 2 +- modules/local/subworkflow/recalibrate.nf | 8 ++++---- modules/nf-core/software/strelka/germline.nf | 14 +++++++------- 4 files changed, 18 insertions(+), 13 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 74979b37ec..cb73ab1d10 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -110,6 +110,7 @@ params { suffix = "recal" publish_by_id = "true" publish_dir = "preprocessing" + publish_files = ['recal.bam':'recalibrated', 'recal.bam.bai':'recalibrated'] 
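With `suffix = "recal"`, `publish_by_id = "true"`, `publish_dir = "preprocessing"` and the `publish_files` map above, the module options compose into exactly the path that the TSV writer in recalibrate.nf (next hunks) hard-codes. A sketch of that composition, joining path segments the way `getPathFromList` does; the sample id is illustrative:

// Sketch: how the module options compose into the published path that
// modules/local/subworkflow/recalibrate.nf expects when writing TSVs.
def opts = [
    suffix       : 'recal',
    publish_by_id: 'true',
    publish_dir  : 'preprocessing',
    publish_files: ['recal.bam': 'recalibrated', 'recal.bam.bai': 'recalibrated']
]
def sample = 'sample1'                              // illustrative sample id
String filename = "${sample}.${opts.suffix}.bam"    // process output after suffixing

def pathList = [opts.publish_dir]
if (opts.publish_by_id == 'true') pathList << sample
def subdir    = opts.publish_files.find { filename.endsWith(it.key) }?.value
def published = subdir ? "${(pathList + subdir).join('/')}/${filename}" : null

assert published == 'preprocessing/sample1/recalibrated/sample1.recal.bam'

Keeping this config in sync with the hard-coded TSV paths is the point of the fix: the TSVs previously pointed at `.md.bam`, which is presumably what `--step variant_calling` tripped over.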
} 'samtools_stats_recalibrate' { publish_by_id = "true" @@ -127,17 +128,21 @@ params { publish_files = "false" } 'concat_haplotypecaller' { + suffix = "haplotypecaller" publish_by_id = "true" publish_dir = "variant_calling" + publish_files = ['vcf.gz':'haplotypecaller', 'vcf.gz.tbi':'haplotypecaller'] } 'concat_gvcf' { - suffix = ".g" + suffix = "haplotypecaller_gvcf" publish_by_id = "true" publish_dir = "variant_calling" + publish_files = ['vcf.gz':'haplotypecaller_gvcf', 'vcf.gz.tbi':'haplotypecaller_gvcf'] } 'strelka_germline' { publish_by_id = "true" publish_dir = "variant_calling" + publish_files = ['vcf.gz':'strelka', 'vcf.gz.tbi':'strelka'] } // QC_TRIM 'fastqc' { diff --git a/modules/local/process/concat_vcf.nf b/modules/local/process/concat_vcf.nf index 62939cad74..aa1bc5387b 100644 --- a/modules/local/process/concat_vcf.nf +++ b/modules/local/process/concat_vcf.nf @@ -27,7 +27,7 @@ process CONCAT_VCF { tuple val(meta), path("*_*.vcf.gz"), path("*_*.vcf.gz.tbi"), emit: vcf script: - name = options.suffix ? "${options.publish_dir}_${meta.id}${options.suffix}" : "${options.publish_dir}_${meta.id}" + name = options.suffix ? "${options.suffix}_${meta.id}" : "${meta.id}" target_options = params.target_bed ? "-t ${bed}" : "" interval_options = params.no_intervals ? "-n" : "" """ diff --git a/modules/local/subworkflow/recalibrate.nf b/modules/local/subworkflow/recalibrate.nf index 2f62a6e562..5551199799 100644 --- a/modules/local/subworkflow/recalibrate.nf +++ b/modules/local/subworkflow/recalibrate.nf @@ -103,8 +103,8 @@ workflow RECALIBRATE { sample = meta.sample gender = meta.gender status = meta.status - bam = "${params.outdir}/preprocessing/${sample}/recalibrated/${sample}.md.bam" - bai = "${params.outdir}/preprocessing/${sample}/recalibrated/${sample}.md.bam.bai" + bam = "${params.outdir}/preprocessing/${sample}/recalibrated/${sample}.recal.bam" + bai = "${params.outdir}/preprocessing/${sample}/recalibrated/${sample}.recal.bam.bai" ["recalibrated_${sample}.tsv", "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n"] } @@ -113,8 +113,8 @@ workflow RECALIBRATE { sample = meta.sample gender = meta.gender status = meta.status - bam = "${params.outdir}/preprocessing/${sample}/recalibrated/${sample}.md.bam" - bai = "${params.outdir}/preprocessing/${sample}/recalibrated/${sample}.md.bam.bai" + bam = "${params.outdir}/preprocessing/${sample}/recalibrated/${sample}.recal.bam" + bai = "${params.outdir}/preprocessing/${sample}/recalibrated/${sample}.recal.bam.bai" "${patient}\t${gender}\t${status}\t${sample}\t${bam}\t${bai}\n" }.collectFile(name: 'recalibrated.tsv', sort: true, storeDir: "${params.outdir}/preprocessing/tsv") } diff --git a/modules/nf-core/software/strelka/germline.nf b/modules/nf-core/software/strelka/germline.nf index 7a97bf2bd6..278af2adff 100644 --- a/modules/nf-core/software/strelka/germline.nf +++ b/modules/nf-core/software/strelka/germline.nf @@ -33,7 +33,7 @@ process STRELKA_GERMLINE { script: def software = getSoftwareName(task.process) def ioptions = initOptions(options) - def prefix = ioptions.suffix ? "Strelka_${meta.id}" : "Strelka_${meta.id}" + def prefix = ioptions.suffix ? "strelka_${meta.id}${ioptions.suffix}" : "strelka_${meta.id}" // TODO nf-core: It MUST be possible to pass additional parameters to the tool as a command-line string via the "$ioptions.args" variable // TODO nf-core: If the tool supports multi-threading then you MUST provide the appropriate parameter // using the Nextflow "task" variable e.g. 
"--threads $task.cpus" @@ -45,17 +45,17 @@ process STRELKA_GERMLINE { --bam ${bam} \ --referenceFasta ${fasta} \ ${options_strelka} \ - --runDir Strelka + --runDir strelka - python Strelka/runWorkflow.py -m local -j ${task.cpus} + python strelka/runWorkflow.py -m local -j ${task.cpus} - mv Strelka/results/variants/genome.*.vcf.gz ${prefix}_genome.vcf.gz + mv strelka/results/variants/genome.*.vcf.gz ${prefix}_genome.vcf.gz - mv Strelka/results/variants/genome.*.vcf.gz.tbi ${prefix}_genome.vcf.gz.tbi + mv strelka/results/variants/genome.*.vcf.gz.tbi ${prefix}_genome.vcf.gz.tbi - mv Strelka/results/variants/variants.vcf.gz ${prefix}_variants.vcf.gz + mv strelka/results/variants/variants.vcf.gz ${prefix}_variants.vcf.gz - mv Strelka/results/variants/variants.vcf.gz.tbi ${prefix}_variants.vcf.gz.tbi + mv strelka/results/variants/variants.vcf.gz.tbi ${prefix}_variants.vcf.gz.tbi echo configureStrelkaGermlineWorkflow.py --version &> ${software}.version.txt #2>&1 """