diff --git a/main.nf b/main.nf index b465cc3..ea85b7d 100644 --- a/main.nf +++ b/main.nf @@ -23,7 +23,6 @@ * Evan Floden */ - /* * Define the default parameters */ @@ -34,7 +33,13 @@ params.blacklist = "$baseDir/data/blacklist.bed" params.reads = "$baseDir/data/reads/rep1_{1,2}.fq.gz" params.results = "results" params.gatk = '/usr/local/bin/GenomeAnalysisTK.jar' -params.gatk_launch = "java -jar $params.gatk" + + +/* + * Import modules + */ + +require 'modules.nf', params:[gatk: params.gatk, results: params.results] log.info """\ C A L L I N G S - N F v 1.0 @@ -51,7 +56,6 @@ gatk : $params.gatk * Parse the input parameters */ -GATK = params.gatk_launch genome_file = file(params.genome) variants_file = file(params.variants) blacklist_file = file(params.blacklist) @@ -64,160 +68,41 @@ reads_ch = Channel.fromFilePairs(params.reads) * Process 1A: Create a FASTA genome index (.fai) with samtools for GATK */ -process '1A_prepare_genome_samtools' { - tag "$genome.baseName" - - input: - file genome from genome_file - - output: - file "${genome}.fai" into genome_index_ch - - script: - """ - samtools faidx ${genome} - """ -} - +genome_index_ch = '1A_prepare_genome_samtools'(genome_file) /* * Process 1B: Create a FASTA genome sequence dictionary with Picard for GATK */ -process '1B_prepare_genome_picard' { - tag "$genome.baseName" - label 'mem_xlarge' - - input: - file genome from genome_file - output: - file "${genome.baseName}.dict" into genome_dict_ch - - script: - """ - PICARD=`which picard.jar` - java -jar \$PICARD CreateSequenceDictionary R= $genome O= ${genome.baseName}.dict - """ -} +genome_dict_ch = '1B_prepare_genome_picard'(genome_file) /* * Process 1C: Create STAR genome index file. */ -process '1C_prepare_star_genome_index' { - tag "$genome.baseName" - - input: - file genome from genome_file - output: - file "genome_dir" into genome_dir_ch - - script: - """ - mkdir genome_dir - - STAR --runMode genomeGenerate \ - --genomeDir genome_dir \ - --genomeFastaFiles ${genome} \ - --runThreadN ${task.cpus} - """ -} +genome_dir_ch = '1C_prepare_star_genome_index'(genome_file) /* * Process 1D: Create a file containing the filtered and recoded set of variants */ -process '1D_prepare_vcf_file' { - tag "$variantsFile.baseName" +prepared_vcf_ch = '1D_prepare_vcf_file'(variants_file, blacklist_file) - input: - file variantsFile from variants_file - file blacklisted from blacklist_file - - output: - set file("${variantsFile.baseName}.filtered.recode.vcf.gz"), file("${variantsFile.baseName}.filtered.recode.vcf.gz.tbi") into prepared_vcf_ch - - script: - """ - vcftools --gzvcf $variantsFile -c \ - --exclude-bed ${blacklisted} \ - --recode | bgzip -c \ - > ${variantsFile.baseName}.filtered.recode.vcf.gz - - tabix ${variantsFile.baseName}.filtered.recode.vcf.gz - """ -} - -/* - * END OF PART 1 - *********/ /********** - * PART 2: STAR RNA-Seq Mapping * * Process 2: Align RNA-Seq reads to the genome with STAR */ -process '2_rnaseq_mapping_star' { - tag "$replicateId" - - input: - file genome from genome_file - file genomeDir from genome_dir_ch - set replicateId, file(reads) from reads_ch - - output: - set replicateId, file('Aligned.sortedByCoord.out.bam'), file('Aligned.sortedByCoord.out.bam.bai') into aligned_bam_ch - - script: - """ - # ngs-nf-dev Align reads to genome - STAR --genomeDir $genomeDir \ - --readFilesIn $reads \ - --runThreadN ${task.cpus} \ - --readFilesCommand zcat \ - --outFilterType BySJout \ - --alignSJoverhangMin 8 \ - --alignSJDBoverhangMin 1 \ - --outFilterMismatchNmax 999 - - # 2nd pass 
(improve alignmets using table of splice junctions and create a new index) - mkdir genomeDir - STAR --runMode genomeGenerate \ - --genomeDir genomeDir \ - --genomeFastaFiles $genome \ - --sjdbFileChrStartEnd SJ.out.tab \ - --sjdbOverhang 75 \ - --runThreadN ${task.cpus} - - # Final read alignments - STAR --genomeDir genomeDir \ - --readFilesIn $reads \ - --runThreadN ${task.cpus} \ - --readFilesCommand zcat \ - --outFilterType BySJout \ - --alignSJoverhangMin 8 \ - --alignSJDBoverhangMin 1 \ - --outFilterMismatchNmax 999 \ - --outSAMtype BAM SortedByCoordinate \ - --outSAMattrRGline ID:$replicateId LB:library PL:illumina PU:machine SM:GM12878 - - # Index the BAM file - samtools index Aligned.sortedByCoord.out.bam - """ -} +aligned_bam_ch = '2_rnaseq_mapping_star'( genome_file, genome_dir_ch, reads_ch) -/* - * END OF PART 2 - ******/ /********** - * PART 3: GATK Prepare Mapped Reads * * Process 3: Split reads that contain Ns in their CIGAR string. * Creates k+1 new reads (where k is the number of N cigar elements) @@ -225,94 +110,22 @@ process '2_rnaseq_mapping_star' { * the splicing events represented by the Ns in the original CIGAR. */ -process '3_rnaseq_gatk_splitNcigar' { - tag "$replicateId" - label 'mem_large' - - input: - file genome from genome_file - file index from genome_index_ch - file genome_dict from genome_dict_ch - set replicateId, file(bam), file(index) from aligned_bam_ch - - output: - set replicateId, file('split.bam'), file('split.bai') into splitted_bam_ch - - script: - """ - # SplitNCigarReads and reassign mapping qualities - $GATK -T SplitNCigarReads \ - -R $genome -I $bam \ - -o split.bam \ - -rf ReassignOneMappingQuality \ - -RMQF 255 -RMQT 60 \ - -U ALLOW_N_CIGAR_READS \ - --fix_misencoded_quality_scores - """ -} - -/* - * END OF PART 3 - ******/ +splitted_bam_ch = '3_rnaseq_gatk_splitNcigar'(genome_file, genome_index_ch, genome_dict_ch, aligned_bam_ch) /*********** - * PART 4: GATK Base Quality Score Recalibration Workflow * * Process 4: Base recalibrate to detect systematic errors in base quality scores, * select unique alignments and index * */ -process '4_rnaseq_gatk_recalibrate' { - tag "$replicateId" - label 'mem_large' - - input: - file genome from genome_file - file index from genome_index_ch - file dict from genome_dict_ch - set replicateId, file(bam), file(index) from splitted_bam_ch - set file(variants_file), file(variants_file_index) from prepared_vcf_ch - - output: - set sampleId, file("${replicateId}.final.uniq.bam"), file("${replicateId}.final.uniq.bam.bai") into (final_output_ch, bam_for_ASE_ch) - - script: - sampleId = replicateId.replaceAll(/[12]$/,'') - """ - # Indel Realignment and Base Recalibration - $GATK -T BaseRecalibrator \ - --default_platform illumina \ - -cov ReadGroupCovariate \ - -cov QualityScoreCovariate \ - -cov CycleCovariate \ - -knownSites ${variants_file} \ - -cov ContextCovariate \ - -R ${genome} -I ${bam} \ - --downsampling_type NONE \ - -nct ${task.cpus} \ - -o final.rnaseq.grp - - $GATK -T PrintReads \ - -R ${genome} -I ${bam} \ - -BQSR final.rnaseq.grp \ - -nct ${task.cpus} \ - -o final.bam - - # Select only unique alignments, no multimaps - (samtools view -H final.bam; samtools view final.bam| grep -w 'NH:i:1') \ - |samtools view -Sb - > ${replicateId}.final.uniq.bam - - # Index BAM files - samtools index ${replicateId}.final.uniq.bam - """ -} - -/* - * END OF PART 4 - ******/ - +'4_rnaseq_gatk_recalibrate'( + genome_file, genome_index_ch, + genome_dict_ch, + splitted_bam_ch, + prepared_vcf_ch) + . 
into { final_output_ch; bam_for_ASE_ch } /*********** @@ -325,45 +138,12 @@ process '4_rnaseq_gatk_recalibrate' { */ -process '5_rnaseq_call_variants' { - tag "$sampleId" - label 'mem_large' - - input: - file genome from genome_file - file index from genome_index_ch - file dict from genome_dict_ch - set sampleId, file(bam), file(bai) from final_output_ch.groupTuple() - - output: - set sampleId, file('final.vcf') into vcf_files - - script: - """ - # fix absolute path in dict file - sed -i 's@UR:file:.*${genome}@UR:file:${genome}@g' $dict - echo "${bam.join('\n')}" > bam.list - - # Variant calling - $GATK -T HaplotypeCaller \ - -R $genome -I bam.list \ - -dontUseSoftClippedBases \ - -stand_call_conf 20.0 \ - -o output.gatk.vcf.gz - - # Variant filtering - $GATK -T VariantFiltration \ - -R $genome -V output.gatk.vcf.gz \ - -window 35 -cluster 3 \ - -filterName FS -filter "FS > 30.0" \ - -filterName QD -filter "QD < 2.0" \ - -o final.vcf - """ -} +vcf_files = '5_rnaseq_call_variants'( + genome_file, + genome_index_ch, + genome_dict_ch, + final_output_ch.groupTuple()) -/* - * END OF PART 5 - ******/ /*********** @@ -372,83 +152,13 @@ process '5_rnaseq_call_variants' { * Process 6A: Post-process the VCF result */ -process '6A_post_process_vcf' { - tag "$sampleId" - publishDir "$params.results/$sampleId" - - input: - set sampleId, file('final.vcf') from vcf_files - set file('filtered.recode.vcf.gz'), file('filtered.recode.vcf.gz.tbi') from prepared_vcf_ch - output: - set sampleId, file('final.vcf'), file('commonSNPs.diff.sites_in_files') into vcf_and_snps_ch - - script: - ''' - grep -v '#' final.vcf | awk '$7~/PASS/' |perl -ne 'chomp($_); ($dp)=$_=~/DP\\=(\\d+)\\;/; if($dp>=8){print $_."\\n"};' > result.DP8.vcf - - vcftools --vcf result.DP8.vcf --gzdiff filtered.recode.vcf.gz --diff-site --out commonSNPs - ''' -} +vcf_and_snps_ch = '6A_post_process_vcf'( vcf_files, prepared_vcf_ch ) /* * Process 6B: Prepare variants file for allele specific expression (ASE) analysis */ -process '6B_prepare_vcf_for_ase' { - tag "$sampleId" - publishDir "$params.results/$sampleId" - - input: - set sampleId, file('final.vcf'), file('commonSNPs.diff.sites_in_files') from vcf_and_snps_ch - output: - set sampleId, file('known_snps.vcf') into vcf_for_ASE - file('AF.histogram.pdf') into gghist_pdfs - - script: - ''' - awk 'BEGIN{OFS="\t"} $4~/B/{print $1,$2,$3}' commonSNPs.diff.sites_in_files > test.bed - - vcftools --vcf final.vcf --bed test.bed --recode --keep-INFO-all --stdout > known_snps.vcf - - grep -v '#' known_snps.vcf | awk -F '\\t' '{print $10}' \ - |awk -F ':' '{print $2}'|perl -ne 'chomp($_); \ - @v=split(/\\,/,$_); if($v[0]!=0 ||$v[1] !=0)\ - {print $v[1]/($v[1]+$v[0])."\\n"; }' |awk '$1!=1' \ - >AF.4R - - gghist.R -i AF.4R -o AF.histogram.pdf - ''' -} - - -/* - * Group data for allele-specific expression. - * - * The `bam_for_ASE_ch` emites tuples having the following structure, holding the final BAM/BAI files: - * - * ( sample_id, file_bam, file_bai ) - * - * The `vcf_for_ASE` channel emits tuples having the following structure, holding the VCF file: - * - * ( sample_id, output.vcf ) - * - * The BAMs are grouped together and merged with VCFs having the same sample id. 
Finally - * it creates a channel named `grouped_vcf_bam_bai_ch` emitting the following tuples: - * - * ( sample_id, file_vcf, List[file_bam], List[file_bai] ) - */ - -bam_for_ASE_ch - .groupTuple() - .phase(vcf_for_ASE) - .map{ left, right -> - def sampleId = left[0] - def bam = left[1] - def bai = left[2] - def vcf = right[1] - tuple(sampleId, vcf, bam, bai) - } - .set { grouped_vcf_bam_bai_ch } +(vcf_for_ASE, gghist_pdfs) = '6B_prepare_vcf_for_ase'( vcf_and_snps_ch ) /* @@ -458,31 +168,11 @@ bam_for_ASE_ch * (ASE) analysis */ -process '6C_ASE_knownSNPs' { - tag "$sampleId" - publishDir "$params.results/$sampleId" - label 'mem_large' - - input: - file genome from genome_file - file index from genome_index_ch - file dict from genome_dict_ch - set sampleId, file(vcf), file(bam), file(bai) from grouped_vcf_bam_bai_ch - - output: - file "ASE.tsv" - - script: - """ - echo "${bam.join('\n')}" > bam.list - - $GATK -R ${genome} \ - -T ASEReadCounter \ - -o ASE.tsv \ - -I bam.list \ - -sites ${vcf} - """ -} +'6C_ASE_knownSNPs'( + genome_file, + genome_index_ch, + genome_dict_ch, + group_per_sample(bam_for_ASE_ch, vcf_for_ASE) ) /* * END OF PART 6 diff --git a/modules.nf b/modules.nf new file mode 100644 index 0000000..836981e --- /dev/null +++ b/modules.nf @@ -0,0 +1,347 @@ +params.gatk = '/usr/local/bin/GenomeAnalysisTK.jar' +params.gatk_launch = "java -jar $params.gatk" +params.results = "results" + +GATK = params.gatk_launch + + +process '1A_prepare_genome_samtools' { + tag "$genome.baseName" + + input: + file genome + + output: + file "${genome}.fai" + + script: + """ + samtools faidx ${genome} + """ +} + +process '1B_prepare_genome_picard' { + tag "$genome.baseName" + label 'mem_xlarge' + + input: + file genome + output: + file "${genome.baseName}.dict" + + script: + """ + PICARD=`which picard.jar` + java -jar \$PICARD CreateSequenceDictionary R= $genome O= ${genome.baseName}.dict + """ +} + +process '1C_prepare_star_genome_index' { + tag "$genome.baseName" + + input: + file genome + output: + file "genome_dir" + + script: + """ + mkdir genome_dir + + STAR --runMode genomeGenerate \ + --genomeDir genome_dir \ + --genomeFastaFiles ${genome} \ + --runThreadN ${task.cpus} + """ +} + +process '1D_prepare_vcf_file' { + tag "$variantsFile.baseName" + + input: + file variantsFile + file blacklisted + + output: + set file("${variantsFile.baseName}.filtered.recode.vcf.gz"), + file("${variantsFile.baseName}.filtered.recode.vcf.gz.tbi") + + script: + """ + vcftools --gzvcf $variantsFile -c \ + --exclude-bed ${blacklisted} \ + --recode | bgzip -c \ + > ${variantsFile.baseName}.filtered.recode.vcf.gz + + tabix ${variantsFile.baseName}.filtered.recode.vcf.gz + """ +} + + +process '2_rnaseq_mapping_star' { + tag "$replicateId" + + input: + file genome + file genomeDir + set replicateId, file(reads) + + output: + set replicateId, + file('Aligned.sortedByCoord.out.bam'), + file('Aligned.sortedByCoord.out.bam.bai') + + script: + """ + # ngs-nf-dev Align reads to genome + STAR --genomeDir $genomeDir \ + --readFilesIn $reads \ + --runThreadN ${task.cpus} \ + --readFilesCommand zcat \ + --outFilterType BySJout \ + --alignSJoverhangMin 8 \ + --alignSJDBoverhangMin 1 \ + --outFilterMismatchNmax 999 + + # 2nd pass (improve alignmets using table of splice junctions and create a new index) + mkdir genomeDir + STAR --runMode genomeGenerate \ + --genomeDir genomeDir \ + --genomeFastaFiles $genome \ + --sjdbFileChrStartEnd SJ.out.tab \ + --sjdbOverhang 75 \ + --runThreadN ${task.cpus} + + # Final read alignments + STAR 
--genomeDir genomeDir \ + --readFilesIn $reads \ + --runThreadN ${task.cpus} \ + --readFilesCommand zcat \ + --outFilterType BySJout \ + --alignSJoverhangMin 8 \ + --alignSJDBoverhangMin 1 \ + --outFilterMismatchNmax 999 \ + --outSAMtype BAM SortedByCoordinate \ + --outSAMattrRGline ID:$replicateId LB:library PL:illumina PU:machine SM:GM12878 + + # Index the BAM file + samtools index Aligned.sortedByCoord.out.bam + """ +} + + +process '3_rnaseq_gatk_splitNcigar' { + tag "$replicateId" + label 'mem_large' + + input: + file genome + file index + file genome_dict + set replicateId, file(bam), file(index) + + output: + set replicateId, + file('split.bam'), + file('split.bai') + + script: + """ + # SplitNCigarReads and reassign mapping qualities + $GATK -T SplitNCigarReads \ + -R $genome -I $bam \ + -o split.bam \ + -rf ReassignOneMappingQuality \ + -RMQF 255 -RMQT 60 \ + -U ALLOW_N_CIGAR_READS \ + --fix_misencoded_quality_scores + """ +} + +process '4_rnaseq_gatk_recalibrate' { + tag "$replicateId" + label 'mem_large' + + input: + file genome + file index + file dict + set replicateId, file(bam), file(index) + set file(variants_file), file(variants_file_index) + + output: + set sampleId, + file("${replicateId}.final.uniq.bam"), + file("${replicateId}.final.uniq.bam.bai") + + script: + sampleId = replicateId.replaceAll(/[12]$/,'') + """ + # Indel Realignment and Base Recalibration + $GATK -T BaseRecalibrator \ + --default_platform illumina \ + -cov ReadGroupCovariate \ + -cov QualityScoreCovariate \ + -cov CycleCovariate \ + -knownSites ${variants_file} \ + -cov ContextCovariate \ + -R ${genome} -I ${bam} \ + --downsampling_type NONE \ + -nct ${task.cpus} \ + -o final.rnaseq.grp + + $GATK -T PrintReads \ + -R ${genome} -I ${bam} \ + -BQSR final.rnaseq.grp \ + -nct ${task.cpus} \ + -o final.bam + + # Select only unique alignments, no multimaps + (samtools view -H final.bam; samtools view final.bam| grep -w 'NH:i:1') \ + |samtools view -Sb - > ${replicateId}.final.uniq.bam + + # Index BAM files + samtools index ${replicateId}.final.uniq.bam + """ +} + + +process '5_rnaseq_call_variants' { + tag "$sampleId" + label 'mem_large' + + input: + file genome + file index + file dict + set sampleId, file(bam), file(bai) + + output: + set sampleId, file('final.vcf') + + script: + """ + # fix absolute path in dict file + sed -i 's@UR:file:.*${genome}@UR:file:${genome}@g' $dict + echo "${bam.join('\n')}" > bam.list + + # Variant calling + $GATK -T HaplotypeCaller \ + -R $genome -I bam.list \ + -dontUseSoftClippedBases \ + -stand_call_conf 20.0 \ + -o output.gatk.vcf.gz + + # Variant filtering + $GATK -T VariantFiltration \ + -R $genome -V output.gatk.vcf.gz \ + -window 35 -cluster 3 \ + -filterName FS -filter "FS > 30.0" \ + -filterName QD -filter "QD < 2.0" \ + -o final.vcf + """ +} + +process '6A_post_process_vcf' { + tag "$sampleId" + publishDir "$params.results/$sampleId" + + input: + set sampleId, file('final.vcf') + set file('filtered.recode.vcf.gz'), file('filtered.recode.vcf.gz.tbi') + output: + set sampleId, + file('final.vcf'), + file('commonSNPs.diff.sites_in_files') + + script: + ''' + grep -v '#' final.vcf | awk '$7~/PASS/' |perl -ne 'chomp($_); ($dp)=$_=~/DP\\=(\\d+)\\;/; if($dp>=8){print $_."\\n"};' > result.DP8.vcf + + vcftools --vcf result.DP8.vcf --gzdiff filtered.recode.vcf.gz --diff-site --out commonSNPs + ''' +} + +process '6B_prepare_vcf_for_ase' { + tag "$sampleId" + publishDir "$params.results/$sampleId" + + input: + set sampleId, file('final.vcf'), 
file('commonSNPs.diff.sites_in_files')
+    output:
+    set sampleId, file('known_snps.vcf')
+    file('AF.histogram.pdf')
+
+    script:
+    '''
+    awk 'BEGIN{OFS="\t"} $4~/B/{print $1,$2,$3}' commonSNPs.diff.sites_in_files > test.bed
+
+    vcftools --vcf final.vcf --bed test.bed --recode --keep-INFO-all --stdout > known_snps.vcf
+
+    grep -v '#' known_snps.vcf | awk -F '\\t' '{print $10}' \
+        |awk -F ':' '{print $2}'|perl -ne 'chomp($_); \
+        @v=split(/\\,/,$_); if($v[0]!=0 ||$v[1] !=0)\
+        {print $v[1]/($v[1]+$v[0])."\\n"; }' |awk '$1!=1' \
+        >AF.4R
+
+    gghist.R -i AF.4R -o AF.histogram.pdf
+    '''
+}
+
+
+process '6C_ASE_knownSNPs' {
+    tag "$sampleId"
+    publishDir "$params.results/$sampleId"
+    label 'mem_large'
+
+    input:
+    file genome
+    file index
+    file dict
+    set sampleId, file(vcf), file(bam), file(bai)
+
+    output:
+    file "ASE.tsv"
+
+    script:
+    """
+    echo "${bam.join('\n')}" > bam.list
+
+    $GATK -R ${genome} \
+          -T ASEReadCounter \
+          -o ASE.tsv \
+          -I bam.list \
+          -sites ${vcf}
+    """
+}
+
+
+/*
+ * Group data for allele-specific expression.
+ *
+ * The `bam_ch` channel emits tuples having the following structure, holding the final BAM/BAI files:
+ *
+ *   ( sample_id, file_bam, file_bai )
+ *
+ * The `vcf_ch` channel emits tuples having the following structure, holding the VCF file:
+ *
+ *   ( sample_id, output.vcf )
+ *
+ * The BAMs are grouped together and merged with VCFs having the same sample id. Finally,
+ * the function returns a channel emitting the following tuples:
+ *
+ *   ( sample_id, file_vcf, List[file_bam], List[file_bai] )
+ */
+
+def group_per_sample(bam_ch, vcf_ch) {
+  bam_ch
+    .groupTuple()
+    .phase(vcf_ch)
+    .map{ left, right ->
+      def sampleId = left[0]
+      def bam = left[1]
+      def bai = left[2]
+      def vcf = right[1]
+      tuple(sampleId, vcf, bam, bai)
+    }
+}
\ No newline at end of file
diff --git a/nextflow.config b/nextflow.config
index 3130c87..b369665 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -6,6 +6,10 @@ profiles {
     process.container = 'cbcrg/callings-nf:gatk4'
     params.gatk_launch = '/gatk-4.0.0.0/gatk'
   }
+
+  test {
+    docker.enabled = true
+  }
 
   travis {
     docker.enabled = true
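For reference, the fragment below is a minimal sketch, not part of the patch, showing how the grouped channel returned by `group_per_sample` could be inspected from main.nf. It assumes the channel names used in the patch (`bam_for_ASE_ch`, `vcf_for_ASE`); the closure parameter names are illustrative, and since a channel can only be consumed once here it would temporarily stand in for the '6C_ASE_knownSNPs' invocation while debugging.

    // Illustrative debugging sketch only, not included in the patch
    group_per_sample(bam_for_ASE_ch, vcf_for_ASE)
        .subscribe { sampleId, vcf, bams, bais ->
            // each item pairs one sample's VCF with the lists of its BAM/BAI files
            println "sample: ${sampleId}  vcf: ${vcf}  bams: ${bams.size()}"
        }

With the `test` profile added to nextflow.config, the converted pipeline can be exercised with Docker via `nextflow run main.nf -profile test`, assuming a Nextflow build that supports the experimental `require` module syntax used in main.nf.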