From bd540af7775be247650b44c44c43f5101dc38479 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Mon, 19 Nov 2018 10:06:20 +0100 Subject: [PATCH 1/2] add iGenomes --- conf/aws-batch.config | 2 +- conf/genomes.config | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/conf/aws-batch.config b/conf/aws-batch.config index ec8a9e913..a8d38fca5 100644 --- a/conf/aws-batch.config +++ b/conf/aws-batch.config @@ -8,7 +8,7 @@ */ params { - genome_base = params.genome == 'GRCh37' ? "s3://sarek-references/Homo_sapiens/GATK/GRCh37" : params.genome == 'iGRCh38' ? "s3://sarek-references/Homo_sapiens/GATK/GRCh38" : "s3://sarek-references/small" + genome_base = params.genome == 'iGRCh37' ? "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh37" : params.genome == 'iGRCh38' ? "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh38" : "s3://sarek-references/small" publishDirMode = 'copy' } diff --git a/conf/genomes.config b/conf/genomes.config index b736bc9e4..95bd91ff8 100644 --- a/conf/genomes.config +++ b/conf/genomes.config @@ -42,6 +42,19 @@ params { //AF_files = "${params.genome_base}/{00-All.dbsnp_151.hg38.CAF.TOPMED.alternate.allele.freq,hapmap_3.3_grch38_pop_stratified_af.HMAF,SweGen_hg38_stratified.SWAF}.vcf" //AF_indexes = "${params.genome_base}/{00-All.dbsnp_151.hg38.CAF.TOPMED.alternate.allele.freq,hapmap_3.3_grch38_pop_stratified_af.HMAF,SweGen_hg38_stratified.SWAF}.vcf.idx" } + 'iGRCh37' { + acLoci = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_20130502_SNP_maf0.3.loci" + dbsnp = "${params.genome_base}/Annotation/GATKBundle/dbsnp_138.b37.vcf" + dbsnpIndex = "${params.genome_base}/Annotation/GATKBundle/dbsnp_138.b37.vcf.idx" + genomeFile = "${params.genome_base}/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.fasta" + genomeDict = "${params.genome_base}/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.dict" + genomeIndex = "${params.genome_base}/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.fasta.fai" + bwaIndex = 
"${params.genome_base}/Sequence/BWAIndex/human_g1k_v37_decoy.fasta.{amb,ann,bwt,pac,sa}" + intervals = "${params.genome_base}/Annotation/intervals/wgs_calling_regions_CAW.list" + knownIndels = "${params.genome_base}/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf" + knownIndelsIndex = "${params.genome_base}/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf.idx" + snpeffDb = "GRCh37.75" + } 'iGRCh38' { acLoci = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_GRCh38_maf0.3.loci" dbsnp = "${params.genome_base}/Annotation/GATKBundle/dbsnp_146.hg38.vcf.gz" @@ -51,8 +64,8 @@ params { genomeIndex = "${params.genome_base}/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta.fai" bwaIndex = "${params.genome_base}/Sequence/BWAIndex/Homo_sapiens_assembly38.fasta.64.{alt,amb,ann,bwt,pac,sa}" intervals = "${params.genome_base}/Annotation/intervals/wgs_calling_regions.hg38.bed" - knownIndels = "${params.genome_base}/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,Homo_sapiens_assembly38.known_indels}.vcf.gz" - knownIndelsIndex = "${params.genome_base}/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,Homo_sapiens_assembly38.known_indels}.vcf.gz.tbi" + knownIndels = "${params.genome_base}/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz" + knownIndelsIndex = "${params.genome_base}/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz.tbi" snpeffDb = "GRCh38.86" } 'smallGRCh37' { From 892a96e0f2dbf79ea93864e62ecea26b41ded11d Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Mon, 26 Nov 2018 16:55:33 +0100 Subject: [PATCH 2/2] update CHANGELOG and README --- CHANGELOG.md | 1 + docs/REFERENCES.md | 11 ++++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c4963826..03bf50308 100644 --- a/CHANGELOG.md 
+++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - [#671](https://github.com/SciLifeLab/Sarek/pull/671) - New `publishDirMode` param and docs - [#673](https://github.com/SciLifeLab/Sarek/pull/673), [#675](https://github.com/SciLifeLab/Sarek/pull/675), [#676](https://github.com/SciLifeLab/Sarek/pull/676) - Profiles for BinAC and CFC clusters in Tübingen - [#679](https://github.com/SciLifeLab/Sarek/pull/679) - Add container for `CreateIntervalBeds` +- [#692](https://github.com/SciLifeLab/Sarek/pull/692) - Add AWS iGenomes possibilities (currently under `iGRCh37` and `iGRCh38`) ### `Changed` diff --git a/docs/REFERENCES.md b/docs/REFERENCES.md index 2de1923f8..eb6e21c6e 100644 --- a/docs/REFERENCES.md +++ b/docs/REFERENCES.md @@ -1,6 +1,7 @@ # Genomes and reference files -Sarek currently uses GRCh38 by default. The settings are in `genomes.config`, they can be tailored to your needs. The [`buildReferences.nf`](#buildreferencesnf) script can be use to build the indexes based on the reference files. +Sarek currently uses GRCh38 by default. The settings are in `genomes.config` and can be tailored to your needs. +The [`buildReferences.nf`](#buildreferencesnf) script is used to build the indexes for the test reference. ## GRCh37 @@ -36,7 +37,7 @@ The MD5SUM of `Homo_sapiens_assembly38.fasta` included in that file is 7ff134953 If you download the data from the FTP servers `beta/` directory, which seems to be an older version of the bundle, only `Homo_sapiens_assembly38.known_indels.vcf` is needed. Also, you can omit `dbsnp_138_` and `dbsnp_144` files as we use `dbsnp_146`. The old ones also use the wrong chromosome naming convention. The Google Cloud mirror has all data in the `v0` directory, but requires you to remove the `resources_broad_hg38_v0_` prefixes from all files. 
The following files need to be downloaded: - + - 3884c62eb0e53fa92459ed9bff133ae6 - 'Homo_sapiens_assembly38.dict' - 7ff134953dcca8c8997453bbb80b6b5e - 'Homo_sapiens_assembly38.fasta' - b07e65aa4425bc365141756f5c98328c - 'Homo_sapiens_assembly38.fasta.64.alt' @@ -64,7 +65,7 @@ You can create your own cosmic reference for any human reference as specified be ## COSMIC files -To annotate with COSMIC variants during MuTect1/2 Variant Calling you need to create a compatible VCF file. +To annotate with COSMIC variants during MuTect1/2 Variant Calling you need to create a compatible VCF file. Download the coding and non-coding VCF files from [COSMIC](http://cancer.sanger.ac.uk/cosmic/download) and process them with the [Create\_Cosmic.sh](https://github.com/SciLifeLab/Sarek/tree/master/scripts/Create_Cosmic.sh) script for either GRCh37 or GRCh38. The script requires a fasta index `.fai`, of the reference file you are using. @@ -88,6 +89,10 @@ igvtools index Use `--genome smallGRCh37` to map against a small reference genome based on GRCh37. `smallGRCh37` is the default genome for the testing profile (`-profile testing`). +## AWS iGenomes +Sarek uses [AWS iGenomes](https://ewels.github.io/AWS-iGenomes/), which facilitates storing and sharing references. +Both `GRCh37` and `GRCh38` are available with `--genome iGRCh37` or `--genome iGRCh38` respectively; they contain all the data previously detailed. + ## buildReferences.nf The `buildReferences.nf` script can download and build the files needed for smallGRCh37, or build the references for GRCh37/smallGRCh37.