## config.yaml

## Important note:
## All paths defined in this configuration file must be either absolute or relative to the
## location of the Snakefile!

## Reference annotation details
##--------------------------------------------------------------------------------------------
## Source of the reference annotation. Specify "Ensembl" or "Gencode" depending on your choice.
annotation: Ensembl

## Organism name, genome build and annotation release of the reference.
## NOTE(review): presumably these should agree with the reference files listed under
## "Paths to existing reference files" (the example paths contain GRCh38 and 93) -- confirm
## against the Snakefile.
organism: Homo_sapiens # separate with underscore
build: GRCh38
release: 93
##--------------------------------------------------------------------------------------------


## Paths to existing reference files
##--------------------------------------------------------------------------------------------
## txome: transcriptome FASTA file (the example points to a gzipped cDNA file)
## genome: genome FASTA file (the example points to a file that is not gzipped)
## gtf: annotation file in GTF format
txome: example_data/reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.cdna.all.1.1.10M.fa.gz
genome: example_data/reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.dna.chromosome.1.1.10M.fa
gtf: example_data/reference/Ensembl.GRCh38.93/Homo_sapiens.GRCh38.93.1.1.10M.gtf
##--------------------------------------------------------------------------------------------


## Paths to indexes that will be generated by the workflow
##--------------------------------------------------------------------------------------------
## salmonindex: location where the Salmon index will be written
## STARindex: location where the STAR index will be written
salmonindex: example_data/reference/SalmonIndex/Homo_sapiens.GRCh38.93.sidx
## NOTE(review): "salmonk" is disabled. The same value is passed as "-k 31" via
## additional_salmon_index further down, so this key is presumably obsolete -- confirm
## before removing it.
#salmonk: 31
STARindex: example_data/reference/STARIndex/Homo_sapiens.GRCh38.93.STAR.idx
##--------------------------------------------------------------------------------------------

## Additional STAR parameters
## Here, you can specify any optional parameters for the index building and/or alignment
## with STAR. The following arguments are automatically populated and should NOT be
## specified here:
## Indexing: runMode, runThreadN, genomeDir, genomeFastaFiles, sjdbGTFfile, sjdbOverhang
## Alignment: runMode, genomeDir, readFilesIn, runThreadN, outFileNamePrefix, outSAMtype, readFilesCommand
##--------------------------------------------------------------------------------------------
## Add or remove parameters inside the "". Leave the string empty ("") if no additional
## parameters are needed.
## additional_star_index: additional parameters for the index building
## additional_star_align: additional parameters for the alignment
additional_star_index: ""
additional_star_align: ""

## Additional Salmon parameters
## Here, you can specify any optional parameters for the index building and/or
## abundance quantification with Salmon. The following arguments are automatically populated
## based on the arguments specified elsewhere, and should NOT be specified here:
## Indexing: transcriptome input file, index directory, gencode flag
## Quantification: library type, fastq files, index directory, output directory, number of cores
##--------------------------------------------------------------------------------------------
## Additional parameters for the index building.
## Add or remove parameters inside the "".
additional_salmon_index: "-k 31"

## Additional parameters for the abundance quantification.
## Add or remove parameters inside the "".
## Here we specify the mean (--fldMean) and standard deviation (--fldSD) of the fragment
## length distribution, for use with Salmon.
## These values are important to specify for single-end reads.
## For paired-end reads, they define the prior, which is then updated
## based on the observed fragment lengths.
additional_salmon_quant: "--seqBias --gcBias --fldMean 250 --fldSD 25"
##--------------------------------------------------------------------------------------------


## Information about the experiment
##--------------------------------------------------------------------------------------------
## Length of the sequencing reads.
## NOTE(review): presumably given in bases and used to derive STAR's sjdbOverhang, which is
## listed above as automatically populated -- confirm against the Snakefile.
readlength: 63

## Path to the metadata text file. This file must contain at least the following columns:
## names: the sample identifiers = the names of the FASTQ files (excluding the _R1/R2.fastq.gz part)
## type: either SE or PE, indicating whether the sample was analyzed
## via single-end or paired-end sequencing.
metatxt: example_data/metadata.txt

## Variables used for model fitting
## design: design formula for use with edgeR, camera and DRIMSeq. Must be a string
## of the form "~ <predictors>"
## contrast: contrast(s) to estimate in edgeR_dge.Rmd. Separate multiple contrasts with
## commas. The example below defines two contrasts, the second being the reverse of the first.
design: "~ 0 + celline"
contrast: cellineN61311-cellineN052611,cellineN052611-cellineN61311

## Gene sets used for gene set analysis with camera
## Comma-separated list of gene set categories to test with camera.
## Must be a subset of H,C1,C2,C3,C4,C5,C6,C7
## Only required if the variable "run_camera" is True (see below).
genesets: H,C5

## The maximum number of cores to use for FastQC, STAR, Salmon and DRIMSeq.
## Note that the actual number of cores available to Snakemake is determined by
## the --cores argument when it is invoked.
ncores: 1
##---------------------------------------------------------------------------------------------


## Path to a folder containing gzipped fastq files, and the file suffix (typically, either fastq or fq).
## If you have paired-end fastq files, you also need to define the extensions distinguishing the two read files.
## More precisely, ARMOR assumes that paired-end fastq files are named
## <sample-name>_<fqext1>.<fqsuffix>.gz and <sample-name>_<fqext2>.<fqsuffix>.gz.
## Single-end fastq files are expected to be named
## <sample-name>.<fqsuffix>.gz.
##---------------------------------------------------------------------------------------------
## FASTQ: folder containing the gzipped fastq files
## fqext1, fqext2: extensions distinguishing the two read files of a paired-end sample
## fqsuffix: file suffix, without the trailing .gz
FASTQ: example_data/FASTQ
fqext1: R1
fqext2: R2
fqsuffix: fastq
##---------------------------------------------------------------------------------------------


## Path to a folder that will store the output generated by the workflow.
## The workflow will create additional subfolders inside this folder.
## To put the output in the current directory, set output to ".".
##---------------------------------------------------------------------------------------------
output: example_data/output
##---------------------------------------------------------------------------------------------

## R setup
##---------------------------------------------------------------------------------------------
## Specify "True" if R should be installed in a conda environment or "False" if you want to use
## your own R installation (then you have to set the path to your library in the .Renviron file).
useCondaR: True
## NOTE(review): presumably the name of, or path to, the R executable that the workflow
## calls -- confirm against the Snakefile.
Rbin: R
##---------------------------------------------------------------------------------------------

## Conditional conda rules
##---------------------------------------------------------------------------------------------
## Should read trimming, STAR mapping, DRIMSeq analysis and gene set analysis be performed?
## Set the corresponding variable to False if the step is not required.
## run_trimming: read trimming
## run_STAR: STAR mapping
## run_DRIMSeq: DRIMSeq analysis
## run_camera: gene set analysis
run_trimming: True
run_STAR: True
run_DRIMSeq: True
run_camera: True
##---------------------------------------------------------------------------------------------