diff --git a/README.md b/README.md index 7155bf42..c359de52 100755 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ The bwa-mem command from Burrows-Wheeler Aligner(BWA) [[2]](#r2) is used to alig The GATK [[4]](#r4) HaplotypeCaller estimates the most likely genotypes and allele frequencies in an alignment using a Bayesian likelihood model for every position of the genome regardless of whether a variant was detected at that site or not. This information can later be used in the project based genotyping step. A joint analysis has been performed of all the samples in the project. This leads to a posterior probability of a variant allele at a site. SNPs and small Indels are written to a VCF file, along with information such as genotype quality, allele frequency, strand bias and read depth for that SNP/Indel. Based on quality thresholds from the GATK "best practices" [[5]](#r5). The SNPs and indels are filtered and marked as Lowqual or Pass resulting in a final VCF file. + ### References 1. Andrews S. (2010). FastQC: a quality control tool for high throughput sequence data. 
Available online at:http://www.bioinformatics.babraham.ac.uk/projects/fastqc diff --git a/protocols/CoverageCalculations.sh b/protocols/CoverageCalculations.sh index eda80c2a..1fc7dfd3 100755 --- a/protocols/CoverageCalculations.sh +++ b/protocols/CoverageCalculations.sh @@ -78,7 +78,7 @@ then awk -v OFS='\t' '{print $1,$3}' "${sampleNameID}.${perTarget}.coveragePerTarget.sample_interval_summary" | sed '1d' > "${sampleNameID}.${perTarget}.coveragePerTarget.coveragePerTarget.txt.tmp.tmp" sort -V "${sampleNameID}.${perTarget}.coveragePerTarget.coveragePerTarget.txt.tmp.tmp" > "${sampleNameID}.${perTarget}.coveragePerTarget.coveragePerTarget.txt.tmp" - perl -pi -e 's|-|\^|' "${perTargetDir}/${perTarget}.genesOnly" > "${sampleNameID}.${perTarget}.coveragePerTarget.genesOnly.tmp" + perl -p -e 's|-|\^|' "${perTargetDir}/${perTarget}.genesOnly" > "${sampleNameID}.${perTarget}.coveragePerTarget.genesOnly.tmp" paste "${sampleNameID}.${perTarget}.coveragePerTarget.coveragePerTarget.txt.tmp" "${sampleNameID}.${perTarget}.coveragePerTarget.genesOnly.tmp" > "${sampleNameID}.${perTarget}.coveragePerTarget_inclGenes.txt" ##Paste command produces ^M character diff --git a/protocols/CreateExternSamplesProjects.sh b/protocols/CreateExternSamplesProjects.sh index 51c46623..9bfa8881 100755 --- a/protocols/CreateExternSamplesProjects.sh +++ b/protocols/CreateExternSamplesProjects.sh @@ -33,6 +33,10 @@ #list lane #string ngsUtilsVersion +#string dataDir +#string coveragePerBaseDir +#string coveragePerTargetDir + set -e set -u @@ -113,10 +117,29 @@ extract_samples_from_GAF_list.pl --i "${worksheet}" --o "${projectJobsDir}/${pro batching="_small" -capturingKitProject=$(python ${EBROOTNGS_DNA}/scripts/getCapturingKit.py "${projectJobsDir}/${project}.csv") +capturingKitProject=$(python ${EBROOTNGS_DNA}/scripts/getCapturingKit.py "${projectJobsDir}/${project}.csv" | sed 's|\\||' ) +captKit=$(echo "${capturingKitProject}" | awk 'BEGIN {FS="/"}{print $2}') + +if [ ! 
-d "${dataDir}/${capturingKitProject}" ] +then + echo "Bedfile does not exist! Exiting" + exit 1 +fi + if [[ "${capturingKitProject}" == *"Exoom"* || "${capturingKitProject}" == *"All_Exon_v1"* || "${capturingKitProject}" == *"wgs"* || "${capturingKitProject}" == *"WGS"* ]] then - batching="_chr" + batching="_chr" + if [ ! -e "${coveragePerTargetDir}/${captKit}/${captKit}" ] + then + echo "Bedfile in ${coveragePerTargetDir} does not exist! Exiting" + exit 1 + fi +else + if [ ! -e "${coveragePerBaseDir}/${captKit}/${captKit}" ] + then + echo "Bedfile in ${coveragePerBaseDir} does not exist! Exiting" + exit 1 + fi fi if [ -f .compute.properties ]; diff --git a/protocols/CreateInhouseProjects.sh b/protocols/CreateInhouseProjects.sh index ce22f771..6b4193c6 100755 --- a/protocols/CreateInhouseProjects.sh +++ b/protocols/CreateInhouseProjects.sh @@ -31,6 +31,11 @@ #string ngsversion #string ngsUtilsVersion +#string dataDir + +#string coveragePerBaseDir +#string coveragePerTargetDir + #string project #string logsDir @@ -118,12 +123,32 @@ fi batching="_small" -capturingKitProject=$(python ${EBROOTNGS_DNA}/scripts/getCapturingKit.py "${projectJobsDir}/${project}.csv") +capturingKitProject=$(python ${EBROOTNGS_DNA}/scripts/getCapturingKit.py "${projectJobsDir}/${project}.csv" | sed 's|\\||') +captKit=$(echo "${capturingKitProject}" | awk 'BEGIN {FS="/"}{print $2}') + +if [ ! -d "${dataDir}/${capturingKitProject}" ] +then + echo "Bedfile does not exist! Exiting" + exit 1 +fi + if [[ "${capturingKitProject}" == *"Exoom"* || "${capturingKitProject}" == *"All_Exon_v1"* || "${capturingKitProject}" == *"wgs"* || "${capturingKitProject}" == *"WGS"* ]] then batching="_chr" + if [ ! -e "${coveragePerTargetDir}/${captKit}/${captKit}" ] + then + echo "Bedfile in ${coveragePerTargetDir} does not exist! Exiting" + exit 1 + fi +else + if [ ! -e "${coveragePerBaseDir}/${captKit}/${captKit}" ] + then + echo "Bedfile in ${coveragePerBaseDir} does not exist! 
Exiting" + exit 1 + fi fi + echo "BATCHIDLIST=${EBROOTNGS_DNA}/batchIDList${batching}.csv" sh "${EBROOTMOLGENISMINCOMPUTE}/molgenis_compute.sh" \