Skip to content
This repository has been archived by the owner on Jan 27, 2020. It is now read-only.

Commit

Permalink
Merge pull request #697 from MaxUlysse/iGenomes
Browse files Browse the repository at this point in the history
iGenomes specific config file
  • Loading branch information
alneberg authored Dec 11, 2018
2 parents 347bebb + d4b3a1d commit df83874
Show file tree
Hide file tree
Showing 7 changed files with 94 additions and 42 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- [#671](https://github.com/SciLifeLab/Sarek/pull/671) - New `publishDirMode` param and docs
- [#673](https://github.com/SciLifeLab/Sarek/pull/673), [#675](https://github.com/SciLifeLab/Sarek/pull/675), [#676](https://github.com/SciLifeLab/Sarek/pull/676) - Profiles for BinAC and CFC clusters in Tübingen
- [#679](https://github.com/SciLifeLab/Sarek/pull/679) - Add container for `CreateIntervalBeds`
- [#692](https://github.com/SciLifeLab/Sarek/pull/692) - Add AWS iGenomes possibilities (currently under `iGRCh37` and `iGRCh38`)
- [#692](https://github.com/SciLifeLab/Sarek/pull/692), [#697](https://github.com/SciLifeLab/Sarek/pull/697) - Add AWS iGenomes possibilities (within `conf/igenomes.config`)
- [#694](https://github.com/SciLifeLab/Sarek/pull/694) - Add monochrome and grey logos for light or dark background

### `Changed`
Expand Down
2 changes: 1 addition & 1 deletion conf/aws-batch.config
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
*/

params {
genome_base = params.genome == 'iGRCh37' ? "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh37" : params.genome == 'iGRCh38' ? "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh38" : "s3://sarek-references/small"
genome_base = params.genome == 'GRCh37' ? "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh37" : params.genome == 'GRCh38' ? "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh38" : "s3://sarek-references/small"
publishDirMode = 'copy'
}

Expand Down
1 change: 0 additions & 1 deletion conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
* -------------------------------------------------
*/

includeConfig 'genomes.config'
wf_repository = 'maxulysse'

params {
Expand Down
32 changes: 4 additions & 28 deletions conf/genomes.config
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
* -------------------------------------------------
* Path to reference files
* -------------------------------------------------
* Imported under all Nextflow profiles in
* Imported under Nextflow profiles in
* nextflow.config
* -------------------------------------------------
* Modify to add specific versions of genomes
* Defines reference genomes, using paths
* Can be used by any config that customises the base
* path using $params.genome_base / --genome_base
* -------------------------------------------------
*/

Expand Down Expand Up @@ -42,32 +44,6 @@ params {
//AF_files = "${params.genome_base}/{00-All.dbsnp_151.hg38.CAF.TOPMED.alternate.allele.freq,hapmap_3.3_grch38_pop_stratified_af.HMAF,SweGen_hg38_stratified.SWAF}.vcf"
//AF_indexes = "${params.genome_base}/{00-All.dbsnp_151.hg38.CAF.TOPMED.alternate.allele.freq,hapmap_3.3_grch38_pop_stratified_af.HMAF,SweGen_hg38_stratified.SWAF}.vcf.idx"
}
'iGRCh37' {
acLoci = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_20130502_SNP_maf0.3.loci"
dbsnp = "${params.genome_base}/Annotation/GATKBundle/dbsnp_138.b37.vcf"
dbsnpIndex = "${params.genome_base}/Annotation/GATKBundle/dbsnp_138.b37.vcf.idx"
genomeFile = "${params.genome_base}/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.fasta"
genomeDict = "${params.genome_base}/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.dict"
genomeIndex = "${params.genome_base}/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.fasta.fai"
bwaIndex = "${params.genome_base}/Sequence/BWAIndex/human_g1k_v37_decoy.fasta.{amb,ann,bwt,pac,sa}"
intervals = "${params.genome_base}/Annotation/intervals/wgs_calling_regions_CAW.list"
knownIndels = "${params.genome_base}/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf"
knownIndelsIndex = "${params.genome_base}/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf.idx"
snpeffDb = "GRCh37.75"
}
'iGRCh38' {
acLoci = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_GRCh38_maf0.3.loci"
dbsnp = "${params.genome_base}/Annotation/GATKBundle/dbsnp_146.hg38.vcf.gz"
dbsnpIndex = "${params.genome_base}/Annotation/GATKBundle/dbsnp_146.hg38.vcf.gz.tbi"
genomeFile = "${params.genome_base}/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta"
genomeDict = "${params.genome_base}/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.dict"
genomeIndex = "${params.genome_base}/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta.fai"
bwaIndex = "${params.genome_base}/Sequence/BWAIndex/Homo_sapiens_assembly38.fasta.64.{alt,amb,ann,bwt,pac,sa}"
intervals = "${params.genome_base}/Annotation/intervals/wgs_calling_regions.hg38.bed"
knownIndels = "${params.genome_base}/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz"
knownIndelsIndex = "${params.genome_base}/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz.tbi"
snpeffDb = "GRCh38.86"
}
'smallGRCh37' {
acLoci = "${params.genome_base}/1000G_phase3_20130502_SNP_maf0.3.small.loci"
dbsnp = "${params.genome_base}/dbsnp_138.b37.small.vcf"
Expand Down
58 changes: 58 additions & 0 deletions conf/igenomes.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/*
* -------------------------------------------------
* Nextflow config file for Sarek
* -------------------------------------------------
* Path to iGenomes reference files
* -------------------------------------------------
* Imported under Nextflow profiles in
* nextflow.config
* -------------------------------------------------
* Defines reference genomes, using iGenome paths
* Can be used by any config that customises the base
* path using $params.genome_base / --genome_base
* -------------------------------------------------
*/

// Reference-file definitions for the iGenomes directory layout.
// Every path below is built from params.genome_base, so the including
// config (or --genome_base on the command line) has to provide that value.
params {
// One entry per genome name; the name is the key looked up for the chosen genome
// (presumably via --genome — confirm against the main workflow scripts).
genomes {
// GRCh37, iGenomes layout: files live under Annotation/ and Sequence/ sub-directories.
'GRCh37' {
acLoci = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_20130502_SNP_maf0.3.loci"
dbsnp = "${params.genome_base}/Annotation/GATKBundle/dbsnp_138.b37.vcf"
dbsnpIndex = "${params.genome_base}/Annotation/GATKBundle/dbsnp_138.b37.vcf.idx"
genomeFile = "${params.genome_base}/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.fasta"
genomeDict = "${params.genome_base}/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.dict"
genomeIndex = "${params.genome_base}/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.fasta.fai"
// The {a,b,...} suffix lists several file extensions in one pattern
// (presumably expanded as a glob by the consumer — TODO confirm).
bwaIndex = "${params.genome_base}/Sequence/BWAIndex/human_g1k_v37_decoy.fasta.{amb,ann,bwt,pac,sa}"
intervals = "${params.genome_base}/Annotation/intervals/wgs_calling_regions_CAW.list"
// Two known-indel VCFs are referenced through a single brace pattern.
knownIndels = "${params.genome_base}/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf"
knownIndelsIndex = "${params.genome_base}/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf.idx"
// Plain identifier, not a path under genome_base.
snpeffDb = "GRCh37.75"
}
// GRCh38, iGenomes layout: same sub-directories as GRCh37, but the VCFs are
// gzip-compressed (.vcf.gz) with .tbi indexes instead of .vcf/.idx.
'GRCh38' {
acLoci = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_GRCh38_maf0.3.loci"
dbsnp = "${params.genome_base}/Annotation/GATKBundle/dbsnp_146.hg38.vcf.gz"
dbsnpIndex = "${params.genome_base}/Annotation/GATKBundle/dbsnp_146.hg38.vcf.gz.tbi"
genomeFile = "${params.genome_base}/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta"
genomeDict = "${params.genome_base}/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.dict"
genomeIndex = "${params.genome_base}/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta.fai"
// Index file names carry a ".64" infix and include an extra "alt" extension.
bwaIndex = "${params.genome_base}/Sequence/BWAIndex/Homo_sapiens_assembly38.fasta.64.{alt,amb,ann,bwt,pac,sa}"
intervals = "${params.genome_base}/Annotation/intervals/wgs_calling_regions.hg38.bed"
// The second alternative of the brace pattern points into the beta/ sub-directory.
knownIndels = "${params.genome_base}/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz"
knownIndelsIndex = "${params.genome_base}/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz.tbi"
// Plain identifier, not a path under genome_base.
snpeffDb = "GRCh38.86"
}
// Small GRCh37-based reference: flat layout, all files sit directly in genome_base.
'smallGRCh37' {
acLoci = "${params.genome_base}/1000G_phase3_20130502_SNP_maf0.3.small.loci"
dbsnp = "${params.genome_base}/dbsnp_138.b37.small.vcf"
// Derived from the dbsnp entry defined just above; keep that entry first.
dbsnpIndex = "${dbsnp}.idx"
genomeFile = "${params.genome_base}/human_g1k_v37_decoy.small.fasta"
// bwaIndex and genomeIndex are derived from genomeFile; keep genomeFile above them.
bwaIndex = "${genomeFile}.{amb,ann,bwt,pac,sa}"
genomeDict = "${params.genome_base}/human_g1k_v37_decoy.small.dict"
genomeIndex = "${genomeFile}.fai"
intervals = "${params.genome_base}/small.intervals"
knownIndels = "${params.genome_base}/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.small.vcf"
knownIndelsIndex = "${params.genome_base}/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.small.vcf.idx"
// Plain identifier, not a path under genome_base.
snpeffDb = "GRCh37.75"
}
}
}
28 changes: 19 additions & 9 deletions docs/REFERENCES.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
# Genomes and reference files

Sarek currently uses GRCh38 by default. The settings are in `genomes.config`, they can be tailored to your needs.
Sarek currently uses GRCh38 by default.
The settings are in `genomes.config` and can be tailored to your needs.
The [`buildReferences.nf`](#buildreferencesnf) script is used to build the indexes for the reference test.

## GRCh37

Use `--genome GRCh37` to map against GRCh37. Before doing so and if you are not on UPPMAX, you need to adjust the settings in `genomes.config` to your needs.
Use `--genome GRCh37` to map against GRCh37.
Before doing so and if you are not on UPPMAX, you need to adjust the settings in `genomes.config` to your needs.

### GATK bundle

Expand All @@ -20,21 +22,27 @@ The following files need to be downloaded:

### Other files for GRCh37

From our repo, get the [`intervals` list file](https://raw.githubusercontent.com/SciLifeLab/Sarek/master/repeats/wgs_calling_regions.grch37.list). More information about this file in the [intervals documentation](INTERVALS.md)
From our repo, get the [`intervals` list file](https://raw.githubusercontent.com/SciLifeLab/Sarek/master/repeats/wgs_calling_regions.grch37.list).
More information about this file is available in the [intervals documentation](INTERVALS.md).

How to generate the Loci file used in the ASCAT process is described [here](https://github.com/SciLifeLab/Sarek/blob/master/docs/ASCAT.md).

You can create your own cosmic reference for any human reference as specified below in the Cosmic section.

## GRCh38

Use `--genome GRCh38` to map against GRCh38. Before doing so and if you are not on UPPMAX, you need to adjust the settings in `genomes.config` to your needs.
Use `--genome GRCh38` to map against GRCh38.
Before doing so and if you are not on UPPMAX, you need to adjust the settings in `genomes.config` to your needs.

To get the needed files, download the GATK bundle for GRCh38 from [ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/](ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/). You can also download the required files from the Google Cloud mirror link [here](https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0).
To get the needed files, download the GATK bundle for GRCh38 from [ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/](ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/).
You can also download the required files from the Google Cloud mirror link [here](https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0).

The MD5SUM of `Homo_sapiens_assembly38.fasta` included in that file is 7ff134953dcca8c8997453bbb80b6b5e.

If you download the data from the FTP servers `beta/` directory, which seems to be an older version of the bundle, only `Homo_sapiens_assembly38.known_indels.vcf` is needed. Also, you can omit `dbsnp_138_` and `dbsnp_144` files as we use `dbsnp_146`. The old ones also use the wrong chromosome naming convention. The Google Cloud mirror has all data in the `v0` directory, but requires you to remove the `resources_broad_hg38_v0_` prefixes from all files.
If you download the data from the FTP server's `beta/` directory, which seems to be an older version of the bundle, only `Homo_sapiens_assembly38.known_indels.vcf` is needed.
Also, you can omit `dbsnp_138_` and `dbsnp_144` files as we use `dbsnp_146`.
The old ones also use the wrong chromosome naming convention.
The Google Cloud mirror has all data in the `v0` directory, but requires you to remove the `resources_broad_hg38_v0_` prefixes from all files.

The following files need to be downloaded:

Expand Down Expand Up @@ -68,7 +76,8 @@ You can create your own cosmic reference for any human reference as specified be
To annotate with COSMIC variants during MuTect1/2 Variant Calling you need to create a compatible VCF file.
Download the coding and non-coding VCF files from [COSMIC](http://cancer.sanger.ac.uk/cosmic/download) and
process them with the [Create\_Cosmic.sh](https://github.com/SciLifeLab/Sarek/tree/master/scripts/Create_Cosmic.sh)
script for either GRCh37 or GRCh38. The script requires a fasta index `.fai`, of the reference file you are using.
script for either GRCh37 or GRCh38.
The script requires a fasta index `.fai`, of the reference file you are using.

Example:

Expand All @@ -87,11 +96,12 @@ igvtools index <cosmicvxx.vcf>

## smallGRCh37

Use `--genome smallGRCh37` to map against a small reference genome based on GRCh37. `smallGRCh37` is the default genome for the testing profile (`-profile testing`).
Use `--genome smallGRCh37` to map against a small reference genome based on GRCh37.
`smallGRCh37` is the default genome for the testing profile (`-profile testing`).

## AWS iGenomes
Sarek is using [AWS iGenomes](https://ewels.github.io/AWS-iGenomes/), which facilitates storing and sharing references.
Both `GRCh37` and `GRCh38` are available with `--genome iGRCh37` or `--genome iGRCh38` respectively, it contains all data previously detailed.
Both `GRCh37` and `GRCh38` are available with `--genome GRCh37` or `--genome GRCh38` respectively, with any profile that includes the `conf/igenomes.config` file (e.g. `awsbatch`); you can also include it explicitly with `-c conf/igenomes.config`. It contains all data previously detailed.

## buildReferences.nf

Expand Down
13 changes: 11 additions & 2 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ profiles {
// Singularity images need to be set up
standard {
includeConfig 'conf/base.config'
includeConfig 'conf/genomes.config'
includeConfig 'conf/uppmax-localhost.config'
includeConfig 'conf/singularity-path.config'
}
Expand All @@ -36,6 +37,7 @@ profiles {
// Singularity images need to be set up
slurm {
includeConfig 'conf/base.config'
includeConfig 'conf/genomes.config'
includeConfig 'conf/uppmax-slurm.config'
includeConfig 'conf/singularity-path.config'
}
Expand All @@ -44,6 +46,7 @@ profiles {
// Singularity images will be pulled automatically
slurmDownload {
includeConfig 'conf/base.config'
includeConfig 'conf/genomes.config'
includeConfig 'conf/uppmax-slurm.config'
includeConfig 'conf/singularity.config'
includeConfig 'conf/containers.config'
Expand All @@ -52,6 +55,7 @@ profiles {
// Docker images will be pulled automatically
docker {
includeConfig 'conf/base.config'
includeConfig 'conf/genomes.config'
includeConfig 'conf/travis.config'
includeConfig 'conf/docker.config'
includeConfig 'conf/containers.config'
Expand All @@ -60,6 +64,7 @@ profiles {
// Docker images will be pulled automatically
awsbatch {
includeConfig 'conf/base.config'
includeConfig 'conf/igenomes.config'
includeConfig 'conf/aws-batch.config'
includeConfig 'conf/docker.config'
includeConfig 'conf/containers.config'
Expand All @@ -68,6 +73,7 @@ profiles {
// Singularity images will be pulled automatically
singularity {
includeConfig 'conf/base.config'
includeConfig 'conf/genomes.config'
includeConfig 'conf/travis.config'
includeConfig 'conf/singularity.config'
includeConfig 'conf/containers.config'
Expand All @@ -76,6 +82,7 @@ profiles {
// Singularity images need to be set up
singularityPath {
includeConfig 'conf/base.config'
includeConfig 'conf/genomes.config'
includeConfig 'conf/travis.config'
includeConfig 'conf/singularity-path.config'
}
Expand All @@ -85,14 +92,16 @@ profiles {
// Singularity images will be pulled automatically
binac {
includeConfig 'conf/base.config'
includeConfig 'conf/genomes.config'
includeConfig 'conf/binac.config'
includeConfig 'conf/singularity.config'
includeConfig 'conf/resources.config'
includeConfig 'conf/containers.config'
}
// Default config for CFC cluster in Tuebingen/Germany
// Default config for CFC cluster in Tuebingen/Germany
cfc {
includeConfig 'conf/base.config'
includeConfig 'conf/genomes.config'
includeConfig 'conf/cfc.config'
includeConfig 'conf/singularity.config'
includeConfig 'conf/resources.config'
Expand Down Expand Up @@ -132,4 +141,4 @@ def check_max(obj, type) {
return obj
}
}
}
}

0 comments on commit df83874

Please sign in to comment.