params {

    // -------------------------
    // Input/Output directories
    // -------------------------
    input_dir = "$baseDir/input/"
    fastq_dir = "${params.input_dir}/fastq/"
    outdir    = "$baseDir/results"

    // pattern to match for fastq files
    fastq_pattern = "*_R[12]_*.fastq*"
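    // e.g. this default pattern matches paired files named like
    // sample1_R1_001.fastq.gz and sample1_R2_001.fastq.gz
    // (file names here are illustrative, not part of this repository)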
    initial_fastqc_dir    = "${params.outdir}/initial_fastqc/"
    post_trim_fastqc_dir  = "${params.outdir}/post_trim_fastqc/"
    host_filtered_out_dir = "${params.outdir}/host_filtered_fastq/"
    contigs_out_dir       = "${params.outdir}/contigs/"
    blast_out_dir         = "${params.outdir}/blast_output/"
    tally_out_dir         = "${params.outdir}/tallies/"
    virus_seq_out_dir     = "${params.outdir}/virus_sequences/"
    counts_out_dir        = "${params.outdir}/fastq_counts/"
    fastq_out_dir         = "${params.outdir}/trimmed_fastq/"
    bam_out_dir           = "${params.outdir}/bam/"

    // reports on running the pipeline itself
    tracedir = "${params.outdir}/pipeline_info"

    // where R and shell scripts are found
    R_bindir       = "${baseDir}/scripts"
    scripts_bindir = "${baseDir}/scripts"
    // ------------------
    // Trimming settings
    // ------------------
    always_trim_5p_bases = "0"
    always_trim_3p_bases = "1"

    // if your library molecules are very short, you may want to lower this
    post_trim_min_length = "30"

    // cd-hit-dup cutoff for collapsing near-duplicate reads:
    // the number of mismatches allowed between reads that will be collapsed
    mismatches_allowed = "2"
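    // A sketch of how this value might map onto a cd-hit-dup call (the actual
    // invocation lives in the pipeline's process scripts; the file names and
    // exact flags below are assumptions, not taken from this pipeline):
    //
    //   cd-hit-dup -i reads_R1.fastq -i2 reads_R2.fastq \
    //              -o dedup_R1.fastq -o2 dedup_R2.fastq -e 2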
    // --------------------
    // Host cell filtering
    // --------------------
    // Define one of the following two parameters:
    //
    // 1. A 2-column tab-delimited file with:
    //    - the first column defining dataset IDs or patterns that will
    //      match dataset IDs
    //    - the second column giving the path of a bowtie index that will be
    //      used to filter out host reads
    //
    //    This enables different filtering for different datasets.
    host_map_file = ""
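    // Example host_map_file contents (tab-delimited; both the dataset-ID
    // patterns and the index paths below are hypothetical):
    //
    //   fly_sample_.*    /home/databases/fly/combined_fly_index
    //   mosquito_.*      /home/databases/mosquito/aedes_aegypti_index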
    // 2. The path to a bowtie index that will be used to filter host reads
    //    for all datasets
    //
    //    host_bt_index = "/home/databases/fly/combined_fly_index"
    host_bt_index = ""

    // min bowtie alignment score for a read to be considered host-derived
    host_bt_min_score = "60"
    // -------------------------
    // BLAST and classification
    // -------------------------

    // minimum length of contigs to keep for further analysis
    minimum_contig_length = 40

    // classify singletons (reads that don't map to contigs) in addition to contigs?
    // classifying singletons is slower but more thorough
    classify_singletons = true

    // BLAST e-value cutoffs
    max_blast_nt_evalue  = "1e-10"
    max_blastx_nr_evalue = "1e-3"
    blast_db_dir  = "/home/databases/nr_nt/"
    nt_blast_db   = "${params.blast_db_dir}/nt"
    nr_blast_db   = "${params.blast_db_dir}/nr"
    nr_diamond_db = "${params.blast_db_dir}/nr.dmnd"
    // singularity_pull_docker_container option
    //
    // turn this parameter on to pull docker containers and convert to singularity
    //
    // see e.g.: https://nf-co.re/gwas#quick-start, which states:
    //
    // "If you are persistently observing issues downloading Singularity images directly
    //  due to timeout or network issues then please use the --singularity_pull_docker_container
    //  parameter to pull and convert the Docker image instead."
    //
    // TODO: this option is provided in nf-core pipelines, but is it necessary?
    // possibly remove this option and the corresponding if/else statement in processes?
    //
    singularity_pull_docker_container = false
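    // As with any boolean param, this can also be switched on per run from the
    // command line, e.g.:
    //
    //   nextflow run main.nf -profile singularity --singularity_pull_docker_container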
}
process {

    // ------------------------------------------------------------
    // set up resource usage limits for different types of processes
    // ------------------------------------------------------------

    // high memory processes, like blastn (using the nt database)
    withLabel: 'highmem' {
        maxForks = 2
        cpus     = 24
    }

    // low memory processes that use multi-threading, like bowtie2
    withLabel: 'lowmem_threaded' {
        maxForks = 6
        cpus     = 8
    }

    // low memory processes that don't use multi-threading
    withLabel: 'lowmem_non_threaded' {
        maxForks = 24
        cpus     = 1
    }
}
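// These withLabel selectors apply to any process that declares the matching
// label in its definition. A minimal sketch (the process name and script are
// hypothetical, not taken from this pipeline's main.nf):
//
//   process blastn_nt {
//       label 'highmem'
//
//       script:
//       """
//       blastn -query contigs.fa -db ${params.nt_blast_db} -num_threads ${task.cpus}
//       """
//   }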
/*
Profiles allow you to run on different servers or with different base configurations
See: https://www.nextflow.io/docs/latest/config.html#config-profiles
*/
profiles {

    local {
        executor.name      = 'local'
        executor.queueSize = 24
        executor.cpus      = 48
        executor.memory    = '256 GB'

        // if the pipeline has to access system paths outside of $HOME, $PWD, etc.,
        // those paths have to be bound into the singularity container.
        // see: https://sylabs.io/guides/latest/user-guide/bind_paths_and_mounts.html
        // in this profile, we are pointing to local installations of NCBI databases,
        // so we need to access those paths
        singularity.runOptions = "--bind /home/databases"

        params.local_nt_database      = "/home/databases/nr_nt/nt"
        params.local_diamond_database = "/home/databases/nr_nt/nr.dmnd"
        params.remote_blast           = false
    }

    conda {
        params.enable_conda = true
        process.conda       = "./conda/taxonomy_conda_environment.yaml"
        singularity.enabled = false
        conda.cacheDir      = "$HOME/conda_cacheDir"
    }

    singularity {
        params.enable_conda    = false
        singularity.enabled    = true
        singularity.autoMounts = true
        singularity.cacheDir   = "$HOME/singularity_cacheDir"
        // singularity.runOptions = "-B /home/databases"
    }

    test {
        includeConfig 'conf/test.config'
    }
}
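// Profiles can be combined at run time. For example (the fastq path below is
// illustrative):
//
//   nextflow run main.nf -profile local,singularity --fastq_dir /path/to/fastq
//
// or, to use conda instead of singularity:
//
//   nextflow run main.nf -profile conda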
def trace_timestamp = new java.util.Date().format('yyyy-MM-dd_HH-mm-ss')
timeline {
    enabled = true
    file    = "${params.tracedir}/execution_timeline_${trace_timestamp}.html"
}
report {
    enabled = true
    file    = "${params.tracedir}/execution_report_${trace_timestamp}.html"
}
trace {
    enabled = true
    file    = "${params.tracedir}/execution_trace_${trace_timestamp}.txt"
}
dag {
    enabled = true
    file    = "${params.tracedir}/pipeline_dag_${trace_timestamp}.pdf"
}
manifest {
    name            = 'stenglein-lab/taxonomy'
    author          = 'Mark Stenglein'
    homePage        = 'https://github.com/stenglein-lab/taxonomy'
    description     = 'A pipeline to taxonomically classify sequences from Illumina datasets'
    mainScript      = 'main.nf'
    nextflowVersion = '!>=21.04.0'
    version         = '1.0'
}
// Turn this option on to delete all intermediate files from the analysis
// see: https://www.nextflow.io/docs/latest/config.html
// cleanup = true