From b8f63277f89dc6eefdfc19c441541353a596a880 Mon Sep 17 00:00:00 2001 From: npklein Date: Tue, 16 Jun 2015 10:52:06 +0200 Subject: [PATCH] Make PROseq pipeline folder --- .../protocols/AddOrReplaceReadGroups.sh | 66 +++++++++++ .../protocols/CollectMultipleMetrics.sh | 65 +++++++++++ .../PROseq/protocols/CollectRnaSeqMetrics.sh | 54 +++++++++ compute5/PROseq/protocols/CombineBedFiles.sh | 58 ++++++++++ compute5/PROseq/protocols/Fastqc.sh | 103 ++++++++++++++++++ .../PROseq/protocols/GatkAnalyseCovariates.sh | 83 ++++++++++++++ compute5/PROseq/protocols/GatkBQSR.sh | 83 ++++++++++++++ .../PROseq/protocols/GatkGenotypeGvcfs.sh | 68 ++++++++++++ .../protocols/GatkHaplotypeCallerGvcf.sh | 63 +++++++++++ .../protocols/GatkIndelRealignmentKnown.sh | 71 ++++++++++++ compute5/PROseq/protocols/GatkMergeGvcfs.sh | 61 +++++++++++ compute5/PROseq/protocols/GatkSplitAndTrim.sh | 85 +++++++++++++++ .../PROseq/protocols/GatkUnifiedGenotyper.sh | 69 ++++++++++++ compute5/PROseq/protocols/MarkDuplicates.sh | 58 ++++++++++ compute5/PROseq/protocols/MergeBamFiles.sh | 67 ++++++++++++ compute5/PROseq/protocols/SamToSortedBam.sh | 50 +++++++++ compute5/PROseq/protocols/VariantEval.sh | 45 ++++++++ compute5/PROseq/protocols/VerifyBamID.sh | 48 ++++++++ compute5/PROseq/workflow.csv | 16 +++ 19 files changed, 1213 insertions(+) create mode 100644 compute5/PROseq/protocols/AddOrReplaceReadGroups.sh create mode 100644 compute5/PROseq/protocols/CollectMultipleMetrics.sh create mode 100644 compute5/PROseq/protocols/CollectRnaSeqMetrics.sh create mode 100755 compute5/PROseq/protocols/CombineBedFiles.sh create mode 100644 compute5/PROseq/protocols/Fastqc.sh create mode 100644 compute5/PROseq/protocols/GatkAnalyseCovariates.sh create mode 100644 compute5/PROseq/protocols/GatkBQSR.sh create mode 100644 compute5/PROseq/protocols/GatkGenotypeGvcfs.sh create mode 100644 compute5/PROseq/protocols/GatkHaplotypeCallerGvcf.sh create mode 100644 compute5/PROseq/protocols/GatkIndelRealignmentKnown.sh create mode 100644 compute5/PROseq/protocols/GatkMergeGvcfs.sh create mode 100644 compute5/PROseq/protocols/GatkSplitAndTrim.sh create mode 100755 compute5/PROseq/protocols/GatkUnifiedGenotyper.sh create mode 100644 compute5/PROseq/protocols/MarkDuplicates.sh create mode 100644 compute5/PROseq/protocols/MergeBamFiles.sh create mode 100755 compute5/PROseq/protocols/SamToSortedBam.sh create mode 100644 compute5/PROseq/protocols/VariantEval.sh create mode 100755 compute5/PROseq/protocols/VerifyBamID.sh create mode 100644 compute5/PROseq/workflow.csv diff --git a/compute5/PROseq/protocols/AddOrReplaceReadGroups.sh b/compute5/PROseq/protocols/AddOrReplaceReadGroups.sh new file mode 100644 index 00000000..18d408d0 --- /dev/null +++ b/compute5/PROseq/protocols/AddOrReplaceReadGroups.sh @@ -0,0 +1,66 @@ +#MOLGENIS walltime=23:59:00 mem=6gb nodes=1 ppn=4 + +#Parameter mapping #why not string foo,bar? instead of string foo\nstring bar +#string stage +#string checkStage +#string starVersion +#string WORKDIR +#string projectDir + +#string picardVersion +#string starAlignmentPassTwoDir +#string sampleName +#string internalId + + +#string addOrReplaceGroupsDir +#string addOrReplaceGroupsBam +#string addOrReplaceGroupsBai + + + +echo "## "$(date)" ## $0 Started " + +getFile ${starAlignmentPassTwoDir}/Aligned.out.sam + +${stage} picard-tools/${picardVersion} +${checkStage} + +set -x +set -e + +mkdir -p ${addOrReplaceGroupsDir} + +echo "## "$(date)" Start $0" + +if java -Xmx6g -XX:ParallelGCThreads=4 -jar $PICARD_HOME/AddOrReplaceReadGroups.jar \ + INPUT=${starAlignmentPassTwoDir}/Aligned.out.sam \ + OUTPUT=${addOrReplaceGroupsBam} \ + SORT_ORDER=coordinate \ + RGID=${internalId} \ + RGLB=${sampleName}_${internalId} \ + RGPL=ILLUMINA \ + RGPU=${sampleName}_${internalId}_${internalId} \ + RGSM=${sampleName} \ + RGDT=$(date --rfc-3339=date) \ + CREATE_INDEX=true \ + MAX_RECORDS_IN_RAM=4000000 \ + TMP_DIR=${addOrReplaceGroupsDir} \ + +then + echo "returncode: $?"; + + putFile ${addOrReplaceGroupsBam} + putFile ${addOrReplaceGroupsBai} + + echo "succes moving files"; +else + echo "returncode: $?"; + echo "fail"; +fi + +echo "## "$(date)" ## $0 Done " + + + +echo "## "$(date)" ## $0 Done " diff --git a/compute5/PROseq/protocols/CollectMultipleMetrics.sh b/compute5/PROseq/protocols/CollectMultipleMetrics.sh new file mode 100644 index 00000000..df265661 --- /dev/null +++ b/compute5/PROseq/protocols/CollectMultipleMetrics.sh @@ -0,0 +1,65 @@ +#MOLGENIS walltime=23:59:00 mem=4gb nodes=1 ppn=4 + +#string stage +#string checkStage +#string picardVersion +#string RVersion +#string reads2FqGz +#string collectMultipleMetricsDir +#string collectMultipleMetricsPrefix +#string onekgGenomeFasta +#string sortedBam +#string sortedBai +#string toolDir + + +getFile ${sortedBam} +getFile ${sortedBai} +getFile ${onekgGenomeFasta} + + +#load modules +${stage} picard/${picardVersion} + +#Check modules +${checkStage} + +mkdir -p ${collectMultipleMetricsDir}_QC + +echo "## "$(date)" Start $0" + +insertSizeMetrics="" +if [ ${#reads2FqGz} -ne 0 ]; then + insertSizeMetrics="PROGRAM=CollectInsertSizeMetrics" +fi + +#Run Picard CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, QualityScoreDistribution and MeanQualityByCycle +if java -jar -Xmx4g -XX:ParallelGCThreads=4 ${toolDir}picard/${picardVersion}/CollectMultipleMetrics.jar \ + I=${sortedBam} \ + O=${collectMultipleMetricsPrefix} \ + R=${onekgGenomeFasta} \ + PROGRAM=CollectAlignmentSummaryMetrics \ + PROGRAM=QualityScoreDistribution \ + PROGRAM=MeanQualityByCycle \ + $insertSizeMetrics \ + TMP_DIR=${collectMultipleMetricsDir} +then + echo "returncode: $?"; + putFile ${collectMultipleMetricsPrefix}.alignment_summary_metrics + putFile ${collectMultipleMetricsPrefix}.quality_by_cycle_metrics + putFile ${collectMultipleMetricsPrefix}.quality_by_cycle.pdf + putFile ${collectMultipleMetricsPrefix}.quality_distribution_metrics + putFile ${collectMultipleMetricsPrefix}.quality_distribution.pdf + + if [ ${#reads2FqGz} -ne 0 ]; then + putFile ${collectMultipleMetricsPrefix}.insert_size_histogram.pdf + putFile ${collectMultipleMetricsPrefix}.insert_size_metrics + fi + echo "succes moving files"; +else + echo "returncode: $?"; + echo "fail"; +fi + + +echo "## "$(date)" ## $0 Done " diff --git a/compute5/PROseq/protocols/CollectRnaSeqMetrics.sh b/compute5/PROseq/protocols/CollectRnaSeqMetrics.sh new file mode 100644 index 00000000..08e02455 --- /dev/null +++ b/compute5/PROseq/protocols/CollectRnaSeqMetrics.sh @@ -0,0 +1,54 @@ +#MOLGENIS walltime=23:59:00 mem=8gb nodes=1 ppn=4 + +#string stage +#string checkStage +#string picardVersion +#string sortedBam +#string sortedBai +#string collectRnaSeqMetricsDir +#string collectRnaSeqMetrics +#string collectRnaSeqMetricsChart +#string genesRefFlat +#string rRnaIntervalList +#string onekgGenomeFasta +#string toolDir +#string RVersion + +getFile ${sortedBam} +getFile ${sortedBai} + +#Load module +${stage} picard/${picardVersion} +${stage} R/${RVersion} + +#Check modules +${checkStage} + +mkdir -p ${collectRnaSeqMetricsDir} + +echo "## "$(date)" ## $0 Started " + +if java -Xmx8g -XX:ParallelGCThreads=4 -jar ${toolDir}picard/${picardVersion}/CollectRnaSeqMetrics.jar \ + INPUT=${sortedBam} \ + OUTPUT=${collectRnaSeqMetrics} \ + CHART_OUTPUT=${collectRnaSeqMetricsChart} \ + METRIC_ACCUMULATION_LEVEL=SAMPLE \ + METRIC_ACCUMULATION_LEVEL=READ_GROUP \ + REFERENCE_SEQUENCE=${onekgGenomeFasta} \ + REF_FLAT=${genesRefFlat} \ + RIBOSOMAL_INTERVALS=${rRnaIntervalList} \ + STRAND_SPECIFICITY=NONE \ + TMP_DIR=${collectRnaSeqMetricsDir} + +then + echo "returncode: $?"; + putFile ${collectRnaSeqMetrics} + putFile ${collectRnaSeqMetricsChart} + + echo "succes moving files"; +else + echo "returncode: $?"; + echo "fail"; +fi + +echo "## "$(date)" ## $0 Done " diff --git a/compute5/PROseq/protocols/CombineBedFiles.sh b/compute5/PROseq/protocols/CombineBedFiles.sh new file mode 100755 index 00000000..287776f6 --- /dev/null +++ b/compute5/PROseq/protocols/CombineBedFiles.sh @@ -0,0 +1,58 @@ +#MOLGENIS walltime=23:59:00 mem=6gb ppn=4 + +#string stage +#string checkStage +#string projectDir +#list genotypeHarminzerOutput +#string combinedBEDDir +#string plinkVersion +#string genotypeHarmonizerDir + + + +getFile ${genotypeHarminzerOutput}.bed +getFile ${genotypeHarminzerOutput}.bim +getFile ${genotypeHarminzerOutput}.fam +getFile ${genotypeHarminzerOutput}.log + +#Load module +${stage} PLINK/${plinkVersion} + +#Check staging of module +${checkStage} + +mkdir -p ${combinedBEDDir} + + +echo "## "$(date)" ## $0 Started " + +{ +echo "$(printf '%s.bed %s.bim %s.fam\n' $(printf '%s\n' ${genotypeHarminzerOutput[@]}) $(printf '%s\n' ${genotypeHarminzerOutput[@]}) $(printf '%s\n' ${genotypeHarminzerOutput[@]}))" +} > ${combinedBEDDir}combinedFiles.txt.tmp +# remove first line (e.g. first sample) as this will be used as input for plink +# to which the other samples will be merged +sed '1d' ${combinedBEDDir}combinedFiles.txt.tmp > ${combinedBEDDir}combinedFiles.txt +rm ${combinedBEDDir}combinedFiles.txt.tmp + +if plink \ +--bfile ${genotypeHarminzerOutput[0]} \ +--merge-list ${combinedBEDDir}combinedFiles.txt \ +--make-bed \ +--out ${combinedBEDDir}combinedFiles + +then + echo "returncode: $?"; + putFile ${combinedBEDDir}combinedFiles.txt + putFile ${combinedBEDDir}combinedFiles.log + putFile ${combinedBEDDir}combinedFiles.bed + putFile ${combinedBEDDir}combinedFiles.bim + putFile ${combinedBEDDir}combinedFiles.fam + putFile ${combinedBEDDir}combinedFiles.nosex + + echo "succes moving files"; +else + echo "returncode: $?"; + echo "fail"; +fi + +echo "## "$(date)" ## $0 Done " diff --git a/compute5/PROseq/protocols/Fastqc.sh b/compute5/PROseq/protocols/Fastqc.sh new file mode 100644 index 00000000..baa634ea --- /dev/null +++ b/compute5/PROseq/protocols/Fastqc.sh @@ -0,0 +1,103 @@ +#MOLGENIS nodes=1 ppn=1 mem=1gb walltime=10:00:00 + +#string stage +#string checkStage +#string fastqcVersion +#string WORKDIR +#string projectDir +#string fastqcDir +#string fastqcZipExt +#string pairedEndfastqcZip1 +#string pairedEndfastqcZip2 +#string reads1FqGz +#string reads2FqGz +#string sampleName + +echo -e "test ${reads1FqGz} ${reads2FqGz} 1: $(basename ${reads1FqGz} .gz)${fastqcZipExt} \n2: $(basename ${reads2FqGz} .gz)${fastqcZipExt} " + +${stage} FastQC/${fastqcVersion} +${checkStage} + +set -e + +echo "## "$(date)" ## $0 Started " + +if [ ${#reads2FqGz} -eq 0 ]; then + + echo "## "$(date)" Started single end fastqc" + alloutputsexist \ + ${fastqcDir}/$(basename ${reads1FqGz} .gz)${fastqcZipExt} \ + ${singleEndfastqcZip} + + getFile ${reads1FqGz} + + mkdir -p ${fastqcDir} + cd ${fastqcDir} + + ################################################################## + echo + echo "## "$(date)" reads1FqGz" + if fastqc --noextract ${reads1FqGz} --outdir ${fastqcDir} + echo + cp -v ${fastqcDir}/$(basename ${reads1FqGz} .gz)${fastqcZipExt} ${singleEndfastqcZip} + + ################################################################## + + cd $OLDPWD + then + echo "returncode: $?"; + putFile ${fastqcDir}/$(basename ${reads1FqGz} .gz)${fastqcZipExt} + putFile ${singleEndfastqcZip} + echo "succes moving files"; + else + echo "returncode: $?"; + echo "fail"; + fi + + +else + echo "## "$(date)" Started paired end fastqc" + + alloutputsexist \ + ${fastqcDir}/$(basename ${reads1FqGz} .gz)${fastqcZipExt} \ + ${fastqcDir}/$(basename ${reads2FqGz} .gz)${fastqcZipExt} \ + ${pairedEndfastqcZip1} \ + ${pairedEndfastqcZip2} + + getFile ${reads1FqGz} + getFile ${reads2FqGz} + + mkdir -p ${fastqcDir} + cd ${fastqcDir} + + ################################################################## + echo + echo "## "$(date)" reads1FqGz" + if fastqc --noextract ${reads1FqGz} --outdir ${fastqcDir} + + cp -v ${fastqcDir}/$(basename ${reads1FqGz} .gz)${fastqcZipExt} ${pairedEndfastqcZip1} + echo + echo "## "$(date)" reads2FqGz" + fastqc --noextract ${reads2FqGz} --outdir ${fastqcDir} + echo + cp -v ${fastqcDir}/$(basename ${reads2FqGz} .gz)${fastqcZipExt} ${pairedEndfastqcZip2} + + ################################################################## + cd $OLDPWD + + then + echo "returncode: $?"; + putFile ${fastqcDir}/$(basename ${reads1FqGz} .gz)${fastqcZipExt} + putFile ${fastqcDir}/$(basename ${reads2FqGz} .gz)${fastqcZipExt} + putFile ${pairedEndfastqcZip1} + putFile ${pairedEndfastqcZip2} + + echo "succes moving files"; + else + echo "returncode: $?"; + echo "fail"; + fi +fi + + +echo "## "$(date)" ## $0 Done " diff --git a/compute5/PROseq/protocols/GatkAnalyseCovariates.sh b/compute5/PROseq/protocols/GatkAnalyseCovariates.sh new file mode 100644 index 00000000..9b9a3241 --- /dev/null +++ b/compute5/PROseq/protocols/GatkAnalyseCovariates.sh @@ -0,0 +1,83 @@ +#MOLGENIS nodes=1 ppn=2 mem=4gb walltime=23:59:00 + +#Parameter mapping #why not string foo,bar? instead of string foo\nstring bar +#string stage +#string checkStage +#string RVersion +#string gatkVersion +#string onekgGenomeFasta + +#string goldStandardVcf +#string goldStandardVcfIdx +#string oneKgPhase1IndelsVcf +#string oneKgPhase1IndelsVcfIdx + +#string dbsnpVcf +#string dbsnpVcfIdx + +#string bsqrDir +#string bsqrBam +#string bsqrBai + +#string analyseCovarsDir +#string bsqrBeforeGrp +#string bsqrAfterGrp +#string analyseCovariatesPdf + +echo "## "$(date)" ## $0 Started " + + + +getFile ${onekgGenomeFasta} +getFile ${oneKgPhase1IndelsVcf} +getFile ${oneKgPhase1IndelsVcfIdx} +getFile ${dbsnpVcf} +getFile ${dbsnpVcfIdx} +getFile ${goldStandardVcf} +getFile ${goldStandardVcfIdx} +getFile ${bsqrBam} +getFile ${bsqrBai} + +${stage} R/${RVersion} +${stage} GATK/${gatkVersion} +${checkStage} + +set -x +set -e + +mkdir -p ${analyseCovarsDir} + +#do bsqr for covariable determination then do print reads for valid bsqrbams +#check the bsqr part and add known variants + +java -Xmx4g -XX:ParallelGCThreads=2 -Djava.io.tmpdir=${bsqrDir} -jar $GATK_HOME/GenomeAnalysisTK.jar \ + -T BaseRecalibrator\ + -R ${onekgGenomeFasta} \ + -I ${bsqrBam} \ + -o ${bsqrAfterGrp} \ + -knownSites ${dbsnpVcf} \ + -knownSites ${goldStandardVcf}\ + -knownSites ${oneKgPhase1IndelsVcf}\ + -nct 2 + +if java -Xmx4g -XX:ParallelGCThreads=2 -Djava.io.tmpdir=${bsqrDir} -jar $GATK_HOME/GenomeAnalysisTK.jar \ + -T AnalyzeCovariates \ + -R ${onekgGenomeFasta} \ + -ignoreLMT \ + -before ${bsqrBeforeGrp} \ + -after ${bsqrAfterGrp} \ + -plots ${analyseCovariatesPdf} \ + +then + echo "returncode: $?"; + + putFile ${bsqrAfterGrp} + putFile ${analyseCovariatesPdf} + + echo "succes moving files"; +else + echo "returncode: $?"; + echo "fail"; +fi + +echo "## "$(date)" ## $0 Done " diff --git a/compute5/PROseq/protocols/GatkBQSR.sh b/compute5/PROseq/protocols/GatkBQSR.sh new file mode 100644 index 00000000..d3a4043c --- /dev/null +++ b/compute5/PROseq/protocols/GatkBQSR.sh @@ -0,0 +1,83 @@ +#MOLGENIS nodes=1 ppn=2 mem=4gb walltime=23:59:00 + +#Parameter mapping #why not string foo,bar? instead of string foo\nstring bar +#string stage +#string checkStage +#string samtoolsVersion +#string gatkVersion +#string onekgGenomeFasta +#string goldStandardVcf +#string goldStandardVcfIdx + +#string oneKgPhase1IndelsVcf +#string oneKgPhase1IndelsVcfIdx + +#string dbsnpVcf +#string dbsnpVcfIdx + +#string indelRealignmentBam +#string indelRealignmentBai +#string bsqrDir +#string bsqrBam +#string bsqrBai +#string bsqrBeforeGrp + +#pseudo from gatk forum (link: http://gatkforums.broadinstitute.org/discussion/3891/best-practices-for-variant-calling-on-rnaseq): +#java -jar GenomeAnalysisTK.jar -T SplitNCigarReads -R ref.fasta -I dedupped.bam -o split.bam -rf ReassignOneMappingQuality -RMQF 255 -RMQT 60 -U ALLOW_N_CIGAR_READS + +echo "## "$(date)" ## $0 Started " + + +getFile ${onekgGenomeFasta} +getFile ${oneKgPhase1IndelsVcf} +getFile ${oneKgPhase1IndelsVcfIdx} +getFile ${dbsnpVcf} +getFile ${dbsnpVcfIdx} +getFile ${goldStandardVcf} +getFile ${goldStandardVcfIdx} +getFile ${indelRealignmentBam} +getFile ${indelRealignmentBai} + +${stage} GATK/${gatkVersion} +${checkStage} + +set -x +set -e + +mkdir -p ${bsqrDir} + +#do bsqr for covariable determination then do print reads for valid bsqrbams +#check the bsqr part and add known variants + +java -Xmx4g -XX:ParallelGCThreads=2 -Djava.io.tmpdir=${bsqrDir} -jar $GATK_HOME/GenomeAnalysisTK.jar \ + -T BaseRecalibrator\ + -R ${onekgGenomeFasta} \ + -I ${indelRealignmentBam} \ + -o ${bsqrBeforeGrp} \ + -knownSites ${dbsnpVcf} \ + -knownSites ${goldStandardVcf}\ + -knownSites ${oneKgPhase1IndelsVcf}\ + -nct 2 + +if java -Xmx4g -XX:ParallelGCThreads=2 -Djava.io.tmpdir=${bsqrDir} -jar $GATK_HOME/GenomeAnalysisTK.jar \ + -T PrintReads \ + -R ${onekgGenomeFasta} \ + -I ${indelRealignmentBam} \ + -o ${bsqrBam} \ + -BQSR ${bsqrBeforeGrp} \ + -nct 82 + +then + echo "returncode: $?"; + + putFile ${bsqrBam} + putFile ${bsqrBai} + putFile ${bsqrBeforeGrp} + + echo "succes moving files"; +else + echo "returncode: $?"; + echo "fail"; +fi + +echo "## "$(date)" ## $0 Done " diff --git a/compute5/PROseq/protocols/GatkGenotypeGvcfs.sh b/compute5/PROseq/protocols/GatkGenotypeGvcfs.sh new file mode 100644 index 00000000..dfcf62a4 --- /dev/null +++ b/compute5/PROseq/protocols/GatkGenotypeGvcfs.sh @@ -0,0 +1,68 @@ +#MOLGENIS walltime=23:59:00 mem=4gb ppn=4 + +#Parameter mapping #why not string foo,bar? instead of string foo\nstring bar +#string stage +#string checkStage +#string starVersion +#string WORKDIR +#string projectDir +#string dbsnpVcf +#string dbsnpVcfIdx + +#string gatkVersion +#string haplotyperDir +#string onekgGenomeFasta + +#Array reference because it's possible +#list mergeGvcf +#list mergeGvcfIdx + +#string genotypedVcf +#string genotypedVcfIdx + +echo "## "$(date)" ## $0 Started " + + +for file in "${mergeGvcf[@]}" "${mergeGvcfIdx[@]}" "${onekgGenomeFasta}"; do + echo "getFile file='$file'" + getFile $file +done + +#Load gatk module +${stage} GATK/${gatkVersion} +${checkStage} + +set -x +set -e + + +# sort unique and print like ' --variant file1.vcf --variant file2.vcf ' +gvcfs=($(printf '%s\n' "${mergeGvcf[@]}" | sort -u )) + +inputs=$(printf ' --variant %s ' $(printf '%s\n' ${gvcfs[@]})) + +mkdir -p ${haplotyperDir} + +if java -Xmx4g -XX:ParallelGCThreads=4 -Djava.io.tmpdir=${haplotyperDir} -jar $GATK_HOME/GenomeAnalysisTK.jar \ + -T GenotypeGVCFs \ + -R ${onekgGenomeFasta} \ + --dbsnp ${dbsnpVcf}\ + -o ${genotypedVcf} \ + $inputs \ + -stand_call_conf 10.0 \ + -stand_emit_conf 20.0 \ + -nt 4 + +then + echo "returncode: $?"; + + putFile ${genotypedVcf} + putFile ${genotypedVcfIdx} + + echo "succes moving files"; +else + echo "returncode: $?"; + echo "fail"; +fi + +echo "## "$(date)" ## $0 Done "I diff --git a/compute5/PROseq/protocols/GatkHaplotypeCallerGvcf.sh b/compute5/PROseq/protocols/GatkHaplotypeCallerGvcf.sh new file mode 100644 index 00000000..45eb3d71 --- /dev/null +++ b/compute5/PROseq/protocols/GatkHaplotypeCallerGvcf.sh @@ -0,0 +1,63 @@ +#MOLGENIS walltime=23:59:00 mem=12gb ppn=2 + +#Parameter mapping #why not string foo,bar? instead of string foo\nstring bar +#string stage +#string checkStage +#string starVersion +#string WORKDIR +#string projectDir + +#string gatkVersion +#string dbsnpVcf +#string dbsnpVcfIdx +#string onekgGenomeFasta +#string indelRealignmentBam +#string indelRealignmentBai + +#string haplotyperDir +#string haplotyperGvcf +#string haplotyperGvcfIdx + + +echo "## "$(date)" ## $0 Started " + +for file in "${indelRealignmentBam[@]}" "${indelRealignmentBai[@]}" "${dbsnpVcf}" "${dbsnpVcfIdx}" "${onekgGenomeFasta}"; do + echo "getFile file='$file'" + getFile $file +done + +#Load gatk module +${stage} GATK/${gatkVersion} +${checkStage} + +#sort unique and print like 'INPUT=file1.bam INPUT=file2.bam ' +bams=($(printf '%s\n' "${indelRealignmentBam[@]}" | sort -u )) + +inputs=$(printf ' -I %s ' $(printf '%s\n' ${bams[@]})) + +mkdir -p ${haplotyperDir} + +if java -Xmx12g -XX:ParallelGCThreads=2 -Djava.io.tmpdir=${haplotyperDir} -jar $GATK_HOME/GenomeAnalysisTK.jar \ + -T HaplotypeCaller \ + -R ${onekgGenomeFasta} \ + --dbsnp ${dbsnpVcf}\ + $inputs \ + -dontUseSoftClippedBases \ + -stand_call_conf 10.0 \ + -stand_emit_conf 20.0 \ + -o ${haplotyperGvcf} \ + --emitRefConfidence GVCF + +then + echo "returncode: $?"; + + putFile ${haplotyperGvcf} + putFile ${haplotyperGvcfIdx} + + echo "succes moving files"; +else + echo "returncode: $?"; + echo "fail"; +fi + +echo "## "$(date)" ## $0 Done " diff --git a/compute5/PROseq/protocols/GatkIndelRealignmentKnown.sh b/compute5/PROseq/protocols/GatkIndelRealignmentKnown.sh new file mode 100644 index 00000000..9587f1f2 --- /dev/null +++ b/compute5/PROseq/protocols/GatkIndelRealignmentKnown.sh @@ -0,0 +1,71 @@ +#MOLGENIS nodes=1 ppn=2 mem=8gb walltime=23:59:00 + +#Parameter mapping #why not string foo,bar? instead of string foo\nstring bar +#string stage +#string checkStage +#string gatkVersion +#string onekgGenomeFasta +#string indelRealignmentTargets +#string goldStandardVcf +#string goldStandardVcfIdx +#string oneKgPhase1IndelsVcf +#string oneKgPhase1IndelsVcfIdx + +#string splitAndTrimBam +#string splitAndTrimBai + +#string indelRealignmentDir +#string indelRealignmentBam +#string indelRealignmentBai + +#pseudo from gatk forum (link: http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_indels_IndelRealigner): +#java -Xmx4g -jar GenomeAnalysisTK.jar -T IndelRealigner -R ref.fa -I input.bam -targetIntervals intervalListFromRTC.intervals -o realignedBam.bam [-known /path/to/indels.vcf] -U ALLOW_N_CIGAR_READS --allow_potentially_misencoded_quality_scores + +echo "## "$(date)" ## $0 Started " + + +${stage} GATK/${gatkVersion} +${checkStage} + +getFile ${onekgGenomeFasta} +getFile ${splitAndTrimBam} +getFile ${splitAndTrimBai} +getFile ${indelRealignmentTargets} +getFile ${oneKgPhase1IndelsVcf} +getFile ${goldStandardVcf} +getFile ${oneKgPhase1IndelsVcfIdx} +getFile ${goldStandardVcfIdx} + +set -x +set -e + +if [ ! -e ${indelRealignmentDir} ]; then + mkdir -p ${indelRealignmentDir} +fi + + +if java -Xmx8g -XX:ParallelGCThreads=2 -Djava.io.tmpdir=${indelRealignmentDir} -jar $GATK_HOME/GenomeAnalysisTK.jar \ + -T IndelRealigner \ + -R ${onekgGenomeFasta} \ + -I ${splitAndTrimBam} \ + -o ${indelRealignmentBam} \ + -targetIntervals ${indelRealignmentTargets} \ + -known ${oneKgPhase1IndelsVcf} \ + -known ${goldStandardVcf} \ + -U ALLOW_N_CIGAR_READS \ + --consensusDeterminationModel KNOWNS_ONLY \ + --LODThresholdForCleaning 0.4 \ + +then + echo "returncode: $?"; + + putFile ${indelRealignmentBam} + putFile ${indelRealignmentBai} + + echo "succes moving files"; +else + echo "returncode: $?"; + echo "fail"; +fi + +echo "## "$(date)" ## $0 Done " diff --git a/compute5/PROseq/protocols/GatkMergeGvcfs.sh b/compute5/PROseq/protocols/GatkMergeGvcfs.sh new file mode 100644 index 00000000..5db4ade9 --- /dev/null +++ b/compute5/PROseq/protocols/GatkMergeGvcfs.sh @@ -0,0 +1,61 @@ +#MOLGENIS walltime=23:59:00 mem=30gb ppn=2 +################################^advised 45 gb for 300 files so 30 for 200 files? +#Parameter mapping #why not string foo,bar? instead of string foo\nstring bar +#string stage +#string checkStage +#string starVersion +#string WORKDIR +#string projectDir +#string onekgGenomeFasta +#string gatkVersion +#list haplotyperGvcf +#list haplotyperGvcfIdx + +#string haplotyperDir +#string mergeGvcf +#string mergeGvcfIdx + +echo "## "$(date)" ## $0 Started " + + + +for file in "${haplotyperGvcf[@]}" "${haplotyperGvcfIdx[@]}" "${onekgGenomeFasta}"; do + echo "getFile file='$file'" + getFile $file +done + +#Load gatk module +${stage} GATK/${gatkVersion} +${checkStage} + +set -x +set -e + +#sort unique and print like 'INPUT=file1.bam INPUT=file2.bam ' +gvcfs=($(printf '%s\n' "${haplotyperGvcf[@]}" | sort -u )) + +inputs=$(printf ' --variant %s ' $(printf '%s\n' ${gvcfs[@]})) + +mkdir -p ${haplotyperDir} + +#pseudo: java -jar GenomeAnalysisTK.jar -T HaplotypeCaller -R ref.fasta -I input.bam -recoverDanglingHeads -dontUseSoftClippedBases -stand_call_conf 20.0 -stand_emit_conf 20.0 -o output.vcf from http://gatkforums.broadinstitute.org/discussion/3891/calling-variants-in-rnaseq + +if java -Xmx30g -XX:ParallelGCThreads=2 -Djava.io.tmpdir=${haplotyperDir} -jar $GATK_HOME/GenomeAnalysisTK.jar \ + -T CombineGVCFs \ + -R ${onekgGenomeFasta} \ + -o ${mergeGvcf} \ + $inputs + +then + echo "returncode: $?"; + + putFile ${mergeGvcf} + putFile ${mergeGvcf} + + echo "succes moving files"; +else + echo "returncode: $?"; + echo "fail"; +fi + +echo "## "$(date)" ## $0 Done " diff --git a/compute5/PROseq/protocols/GatkSplitAndTrim.sh b/compute5/PROseq/protocols/GatkSplitAndTrim.sh new file mode 100644 index 00000000..2b5d8401 --- /dev/null +++ b/compute5/PROseq/protocols/GatkSplitAndTrim.sh @@ -0,0 +1,85 @@ +#MOLGENIS nodes=1 ppn=2 mem=8gb walltime=23:59:00 + +#Parameter mapping #why not string foo,bar? instead of string foo\nstring bar +#string stage +#string checkStage +#string samtoolsVersion +#string gatkVersion +#string markDuplicatesBam +#string markDuplicatesBai +#string onekgGenomeFasta + +#string splitAndTrimBam +#string splitAndTrimBai +#string splitAndTrimDir + +echo "## "$(date)" ## $0 Started " + + +getFile ${onekgGenomeFasta} +getFile ${markDuplicatesBam} +getFile ${markDuplicatesBai} + +${stage} samtools/${samtoolsVersion} +${stage} GATK/${gatkVersion} +${checkStage} + +set -x +set -e + +mkdir -p ${splitAndTrimDir} + +#it botches on base quality scores use --allow_potentially_misencoded_quality_scores / the tool is not paralel with nt/nct +qualAction=$(samtools view ${markDuplicatesBam} | \ + head -1000000 | \ + awk '{gsub(/./,"&\n",$11);print $11}'| \ + sort -u| \ + perl -wne ' + $_=ord($_); + print $_."\n"if(not($_=~/10/));' | \ + sort -n | \ + perl -wne ' + use strict; + use List::Util qw/max min/; + my @ords=; + if(min(@ords) >= 59 && max(@ords) <=104 ){ + print " --fix_misencoded_quality_scores "; + warn "Illumina <= 1.7 scores detected using:--fix_misencoded_quality_scores.\n"; + }elsif(min(@ords) >= 33 && max(@ords) <= 74){ + print " "; + warn "quals > illumina 1.8 detected no action to take.\n"; + }elsif(min(@ords) >= 33 && max(@ords) <= 80){ + print " --allow_potentially_misencoded_quality_scores "; + warn "Strange illumina like quals detected using:--allow_potentially_misencoded_quality_scores." + }else{ + die "Cannot estimate quality scores here is the list:".join(",",@ords)."\n"; + } +') +echo +echo "## Action to perform in quals: "$qualAction" ##" +echo + +if java -Xmx8g -XX:ParallelGCThreads=2 -Djava.io.tmpdir=${splitAndTrimDir} -jar $GATK_HOME/GenomeAnalysisTK.jar \ + -T SplitNCigarReads \ + -R ${onekgGenomeFasta} \ + -I ${markDuplicatesBam} \ + -o ${splitAndTrimBam} \ + -rf ReassignOneMappingQuality \ + -RMQF 255 \ + -RMQT 60 \ + -U ALLOW_N_CIGAR_READS \ + $qualAction + +then + echo "returncode: $?"; + + putFile ${splitAndTrimBam} + putFile ${splitAndTrimBai} + + echo "succes moving files"; +else + echo "returncode: $?"; + echo "fail"; +fi + +echo "## "$(date)" ## $0 Done " diff --git a/compute5/PROseq/protocols/GatkUnifiedGenotyper.sh b/compute5/PROseq/protocols/GatkUnifiedGenotyper.sh new file mode 100755 index 00000000..5b12d898 --- /dev/null +++ b/compute5/PROseq/protocols/GatkUnifiedGenotyper.sh @@ -0,0 +1,69 @@ +#MOLGENIS walltime=23:59:00 nodes=1 mem=8gb ppn=4 + +#string stage +#string checkStage +#string onekgGenomeFasta +#string gatkVersion +#list sortedBam +#string sampleName +#string unifiedGenotyperDir +#string internalId +#string dbsnpVcf +#string toolDir +#string testIntervalList +#string rawVCF +#string uniqueID +#string jdkVersion +#string tabixVersion + + + +getFile ${dbsnpVcf} +getFile ${onekgGenomeFasta} +for file in "${sortedBam[@]}"; do + echo "getFile file='$file'" + getFile $file +done + +#Load modules +${stage} GATK/${gatkVersion} +${stage} tabix/${tabixVersion} + +#check modules +${checkStage} + +mkdir -p ${unifiedGenotyperDir} + +echo "## "$(date)" Start $0" + +#print like '-I=file1.bam -I=file2.bam ' +inputs=$(printf ' -I %s ' $(printf '%s\n' ${sortedBam[@]})) + +if java -Xmx8g -XX:ParallelGCThreads=4 -jar ${toolDir}GATK//${gatkVersion}/GenomeAnalysisTK.jar \ + -R ${onekgGenomeFasta} \ + -T UnifiedGenotyper \ + $inputs \ + -L ${testIntervalList} \ + --dbsnp ${dbsnpVcf} \ + -o ${rawVCF} \ + -U ALLOW_N_CIGAR_READS \ + -rf ReassignMappingQuality \ + -DMQ 60 + +# have to gzip for GenomeHarnomizer use later +bgzip -c ${rawVCF} > ${rawVCF}.gz +tabix -p vcf ${rawVCF}.gz + +then + echo "returncode: $?"; + putFile ${rawVCF} + putFile ${rawVCF}.gz + putFile ${rawVCF}.gz.tbi + putFile ${rawVCF}.idx + echo "succes moving files"; +else + echo "returncode: $?"; + echo "fail"; +fi + +echo "## "$(date)" ## $0 Done " diff --git a/compute5/PROseq/protocols/MarkDuplicates.sh b/compute5/PROseq/protocols/MarkDuplicates.sh new file mode 100644 index 00000000..6af315ab --- /dev/null +++ b/compute5/PROseq/protocols/MarkDuplicates.sh @@ -0,0 +1,58 @@ +#MOLGENIS walltime=23:59:00 mem=6gb nodes=1 ppn=4 + +#Parameter mapping #why not string foo,bar? instead of string foo\nstring bar +#string stage +#string checkStage +#string starVersion +#string WORKDIR +#string projectDir + +#string picardVersion + +#string mergeBamFilesBam +#string mergeBamFilesBai + +#string markDuplicatesDir +#string markDuplicatesBam +#string markDuplicatesBai +#string markDuplicatesMetrics + + +echo "## "$(date)" ## $0 Started " + + +getFile ${mergeBamFilesBam} +getFile ${mergeBamFilesBai} + +${stage} picard-tools/${picardVersion} +${checkStage} + +set -x +set -e + +mkdir -p ${markDuplicatesDir} + +if java -Xmx6g -XX:ParallelGCThreads=4 -jar $PICARD_HOME/MarkDuplicates.jar \ + INPUT=${mergeBamFilesBam} \ + OUTPUT=${markDuplicatesBam} \ + CREATE_INDEX=true \ + MAX_RECORDS_IN_RAM=4000000 \ + TMP_DIR=${markDuplicatesDir} \ + METRICS_FILE=${markDuplicatesMetrics} + +#REMOVE_DUPLICATES=true \? + +then + echo "returncode: $?"; + + putFile ${markDuplicatesBam} + putFile ${markDuplicatesBai} + putFile ${markDuplicatesMetrics} + + echo "succes moving files"; +else + echo "returncode: $?"; + echo "fail"; +fi + +echo "## "$(date)" ## $0 Done " diff --git a/compute5/PROseq/protocols/MergeBamFiles.sh b/compute5/PROseq/protocols/MergeBamFiles.sh new file mode 100644 index 00000000..d26849fc --- /dev/null +++ b/compute5/PROseq/protocols/MergeBamFiles.sh @@ -0,0 +1,67 @@ +#MOLGENIS walltime=23:59:00 mem=6gb ppn=4 + +#Parameter mapping #why not string foo,bar? instead of string foo\nstring bar +#string stage +#string checkStage +#string starVersion +#string WORKDIR +#string projectDir + +#string picardVersion + + +#string addOrReplaceGroupsDir +#list addOrReplaceGroupsBam, addOrReplaceGroupsBai + +#string mergeBamFilesDir +#string mergeBamFilesBam +#string mergeBamFilesBai + + + +echo "## "$(date)" ## $0 Started " + +for file in "${addOrReplaceGroupsBam[@]}" "${addOrReplaceGroupsBai[@]}"; do + echo "getFile file='$file'" + getFile $file +done + +#Load Picard module +${stage} picard-tools/${picardVersion} +${checkStage} + +set -o posix + +set -x +set -e + +#${addOrReplaceGroupsBam} sort unique and print like 'INPUT=file1.bam INPUT=file2.bam ' +bams=($(printf '%s\n' "${addOrReplaceGroupsBam[@]}" | sort -u )) +inputs=$(printf 'INPUT=%s ' $(printf '%s\n' ${bams[@]})) + +mkdir -p ${mergeBamFilesDir} + +if java -jar -XX:ParallelGCThreads=4 -Xmx6g $PICARD_HOME/MergeSamFiles.jar \ + $inputs \ + SORT_ORDER=coordinate \ + CREATE_INDEX=true \ + USE_THREADING=true \ + TMP_DIR=${mergeBamFilesDir} \ + MAX_RECORDS_IN_RAM=6000000 \ + OUTPUT=${mergeBamFilesBam} \ + +# VALIDATION_STRINGENCY=LENIENT \ + +then + echo "returncode: $?"; + + putFile ${mergeBamFilesBam} + putFile ${mergeBamFilesBai} + + echo "succes moving files"; +else + echo "returncode: $?"; + echo "fail"; +fi + +echo "## "$(date)" ## $0 Done " diff --git a/compute5/PROseq/protocols/SamToSortedBam.sh b/compute5/PROseq/protocols/SamToSortedBam.sh new file mode 100755 index 00000000..e6bcaf35 --- /dev/null +++ b/compute5/PROseq/protocols/SamToSortedBam.sh @@ -0,0 +1,50 @@ +#MOLGENIS walltime=23:59:00 nodes=1 mem=6gb ppn=4 + +#string stage +#string checkStage +#string sampleName +#string nTreads +#string internalId +#string platform +#string picardVersion +#string toolDir +#string hisatAlignmentDir +#string sortedBamDir +#string sortedBam +#string sortedBai +#string uniqueID +#string jdkVersion + + + +getFile ${hisatAlignmentDir}${uniqueID}.sam + +#Load modules +${stage} picard/${picardVersion} + +#check modules +${checkStage} + +mkdir -p ${sortedBamDir} + +echo "## "$(date)" Start $0" + +if java -Xmx6g -XX:ParallelGCThreads=4 -jar ${toolDir}picard/${picardVersion}/SortSam.jar \ + INPUT=${hisatAlignmentDir}${uniqueID}.sam \ + OUTPUT=${sortedBam} \ + SO=coordinate \ + CREATE_INDEX=true \ + TMP_DIR=${sortedBamDir} + +then + echo "returncode: $?"; putFile ${sortedBam} + putFile ${sortedBai} + + echo "succes moving files"; +else + echo "returncode: $?"; + echo "fail"; +fi + +echo "## "$(date)" ## $0 Done " + diff --git a/compute5/PROseq/protocols/VariantEval.sh b/compute5/PROseq/protocols/VariantEval.sh new file mode 100644 index 00000000..50f1062c --- /dev/null +++ b/compute5/PROseq/protocols/VariantEval.sh @@ -0,0 +1,45 @@ +#MOLGENIS walltime=23:59:00 mem=8gb nodes=1 ppn=4 + +#string stage +#string checkStage +#string onekgGenomeFasta +#string gatkVersion +#string sampleName +#string internalId +#string toolDir +#string rawVCF +#string uniqueID +#string jdkVersion +#string variantEvalDir +#string evalGrp + + +getFile ${rawVCF} + +#Load modules +${stage} GATK/${gatkVersion} + +#check modules +${checkStage} + +mkdir -p ${variantEvalDir} + +echo "## "$(date)" ## $0 Started " + +if java -Xmx8g -XX:ParallelGCThreads=4 -jar ${toolDir}GATK/${gatkVersion}/GenomeAnalysisTK.jar \ + -T VariantEval \ + -R ${onekgGenomeFasta} \ + -o ${evalGrp} \ + --eval ${rawVCF} \ + +then + echo "returncode: $?"; + putFile ${evalGrp} + echo "succes moving file"; + +else + echo "returncode: $?"; + echo "fail"; +fi + +echo "## "$(date)" ## $0 Done " diff --git a/compute5/PROseq/protocols/VerifyBamID.sh b/compute5/PROseq/protocols/VerifyBamID.sh new file mode 100755 index 00000000..e4c5c0ad --- /dev/null +++ b/compute5/PROseq/protocols/VerifyBamID.sh @@ -0,0 +1,48 @@ +#MOLGENIS walltime=23:59:00 nodes=1 mem=4gb ppn=1 + +#string verifyBamIdDir +#string unifiedGenotyperDir +#string internalId +#string sampleName +#string sortedBam +#string sortedBai +#string uniqueID +#string verifyBamIDVersion +#string stage +#string checkStage + + + +getFile ${unifiedGenotyperDir}${uniqueID}.raw.vcf +getFile ${sortedBam} +getFile ${sortedBai} + +#Load modules +${stage} verifyBamID/${verifyBamIDVersion} + +#check modules +${checkStage} + +mkdir -p ${verifyBamIdDir} + +echo "## "$(date)" Start $0" + +if verifyBamID \ + --vcf ${unifiedGenotyperDir}${uniqueID}.raw.vcf \ + --bam ${sortedBam} \ + --out ${verifyBamIdDir}${uniqueID} + +then + echo "returncode: $?"; putFile ${verifyBamIdDir}${uniqueID}.depthRG + putFile ${verifyBamIdDir}${uniqueID}.depthSM + putFile ${verifyBamIdDir}${uniqueID}.log + putFile ${verifyBamIdDir}${uniqueID}.selfRG + putFile ${verifyBamIdDir}${uniqueID}.selfSM + + echo "succes moving files"; +else + echo "returncode: $?"; + echo "fail"; +fi + +echo "## "$(date)" ## $0 Done " diff --git a/compute5/PROseq/workflow.csv b/compute5/PROseq/workflow.csv new file mode 100644 index 00000000..957a2502 --- /dev/null +++ b/compute5/PROseq/workflow.csv @@ -0,0 +1,16 @@ +step,protocol,dependencies +Fastqc,protocols/Fastqc.sh, +StarRunAlignmentP1,protocols/StarRunAlignmentP1.sh, +StarRunAlignmentP2,protocols/StarRunAlignmentP2.sh,StarRunAlignmentP1 +AddOrReplaceReadGroups,protocols/AddOrReplaceReadGroups.sh,StarRunAlignmentP2 +MergeBamFiles,protocols/MergeBamFiles.sh,AddOrReplaceReadGroups +MarkDuplicates,protocols/MarkDuplicates.sh,MergeBamFiles +CollectRnaSeqMetrics,protocols/CollectRnaSeqMetrics.sh,MarkDuplicates +CollectMultipleMetrics,protocols/CollectMultipleMetrics.sh,MarkDuplicates +GatkSplitAndTrim,protocols/GatkSplitAndTrim.sh,MarkDuplicates +IndelRealignmentKnown,protocols/GatkIndelRealignmentKnown.sh,GatkSplitAndTrim +BQSR,protocols/GatkBQSR.sh,IndelRealignmentKnown +AnalyseCovariates,protocols/GatkAnalyseCovariates.sh,BQSR +HaplotypeCallerGvcf,protocols/GatkHaplotypeCallerGvcf.sh,BQSR +MergeGvcfs,protocols/GatkMergeGvcfs.sh,HaplotypeCallerGvcf +GenotypeGvcfs,protocols/GatkGenotypeGvcfs.sh,MergeGvcfs