Skip to content

Commit

Permalink
Add some Variant Calling (#4)
Browse files Browse the repository at this point in the history
* Update allelecount to 4.0.2
* Update GATK to 4.1.2.0
* Update docs
* Add HaplotypeCaller
* Add single Strelka and single Manta
  • Loading branch information
maxulysse authored May 3, 2019
1 parent c46c460 commit c117cba
Show file tree
Hide file tree
Showing 5 changed files with 489 additions and 46 deletions.
4 changes: 2 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,5 +34,5 @@ install:

script:
- git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data
- nextflow run ${TRAVIS_BUILD_DIR}/build.nf -profile docker --genome smallGRCh37 --refdir data/reference --outdir references --publishDirMode link --max_memory 7.GB --max_cpus 2
- nextflow run ${TRAVIS_BUILD_DIR}/main.nf -profile docker --genome smallGRCh37 --igenomes_base references --sampleDir data/testdata/tiny/normal --publishDirMode link --max_memory 7.GB --max_cpus 2
- nextflow run ${TRAVIS_BUILD_DIR}/build.nf -profile docker --genome smallGRCh37 --refdir data/reference --outdir references --publishDirMode link --max_memory 7.GB --max_cpus 2 -ansi-log false
- nextflow run ${TRAVIS_BUILD_DIR}/main.nf -profile docker --genome smallGRCh37 --sampleDir data/testdata/tiny/normal --tools HaplotypeCaller,Manta,Strelka --igenomes_base references --publishDirMode link --max_memory 7.GB --max_cpus 2 -ansi-log false
9 changes: 6 additions & 3 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,20 @@ pipeline {
stage('Build') {
steps {
sh "git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data"
sh "nextflow run build.nf -profile docker --genome smallGRCh37 --refdir data/reference --outdir references --publishDirMode link"
sh "nextflow run build.nf -profile docker --genome smallGRCh37 --refdir data/reference --outdir references --publishDirMode link -ansi-log false"
sh "rm -rf work/ references/pipeline_info .nextflow*"
}
}
stage('SampleDir') {
steps {
sh "nextflow run main.nf -profile docker --genome smallGRCh37 --igenomes_base references --sampleDir data/testdata/tiny/normal --publishDirMode link"
sh "nextflow run main.nf -profile docker --sampleDir data/testdata/tiny/normal --tools HaplotypeCaller,Manta,Strelka --genome smallGRCh37 --igenomes_base references --publishDirMode link -ansi-log false"
sh "rm -rf work/ .nextflow* results/"
}
}
stage('Multiple') {
steps {
sh "nextflow run main.nf -profile docker --genome smallGRCh37 --igenomes_base references --sample data/testdata/tsv/tiny-multiple.tsv --publishDirMode link"
sh "nextflow run main.nf -profile docker --sample data/testdata/tsv/tiny-multiple.tsv --tools HaplotypeCaller,Manta,Strelka --genome smallGRCh37 --igenomes_base references --publishDirMode link -ansi-log false"
sh "rm -rf work/ .nextflow* results/"
}
}
}
Expand Down
85 changes: 85 additions & 0 deletions bin/concatenateVCFs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#!/usr/bin/env bash
# this script concatenates all VCFs that are in the local directory: the
# purpose is to make a single VCF from all the VCFs that were created from different intervals

usage() { echo "Usage: $0 [-i genome_index_file] [-o output.file.no.gz.extension] <-t target.bed> <-c cpus>" 1>&2; exit 1; }

while getopts "i:c:o:t:" p; do
case "${p}" in
i)
genomeIndex=${OPTARG}
;;
c)
cpus=${OPTARG}
;;
o)
outputFile=${OPTARG}
;;
t)
targetBED=${OPTARG}
;;
*)
usage
;;
esac
done
shift $((OPTIND-1))

if [ -z ${genomeIndex} ]; then echo "Missing index file "; usage; fi
if [ -z ${cpus} ]; then echo "No CPUs defined: setting to 1"; cpus=1; fi
if [ -z ${outputFile} ]; then echo "Missing output file name"; usage; fi

set -euo pipefail

# first make a header from one of the VCF intervals
# get rid of interval information only from the GATK command-line, but leave the rest
FIRSTVCF=$(ls *.vcf | head -n 1)
sed -n '/^[^#]/q;p' $FIRSTVCF | \
awk '!/GATKCommandLine/{print}/GATKCommandLine/{for(i=1;i<=NF;i++){if($i!~/intervals=/ && $i !~ /out=/){printf("%s ",$i)}}printf("\n")}' \
> header

# Get list of contigs from the FASTA index (.fai). We cannot use the ##contig
# header in the VCF as it is optional (FreeBayes does not save it, for example)
CONTIGS=($(cut -f1 ${genomeIndex}))

# concatenate VCFs in the correct order
(
cat header

for chr in "${CONTIGS[@]}"; do
# Skip if globbing would not match any file to avoid errors such as
# "ls: cannot access chr3_*.vcf: No such file or directory" when chr3
# was not processed.
pattern="${chr}_*.vcf"
if ! compgen -G "${pattern}" > /dev/null; then continue; fi

# ls -v sorts by numeric value ("version"), which means that chr1_100_
# is sorted *after* chr1_99_.
for vcf in $(ls -v ${pattern}); do
# Determine length of header.
# The 'q' command makes sed exit when it sees the first non-header
# line, which avoids reading in the entire file.
L=$(sed -n '/^[^#]/q;p' ${vcf} | wc -l)

# Then print all non-header lines. Since tail is very fast (nearly as
# fast as cat), this is way more efficient than using a single sed,
# awk or grep command.
tail -n +$((L+1)) ${vcf}
done
done
) | bgzip -@${cpus} > rawcalls.vcf.gz
tabix rawcalls.vcf.gz

set +u

# now we have the concatenated VCF file, check for WES/panel targets, and generate a subset if there is a BED provided
echo "target is $targetBED"
if [ ! -z ${targetBED+x} ]; then
echo "Selecting subset..."
bcftools isec --targets-file ${targetBED} rawcalls.vcf.gz | bgzip -@${cpus} > ${outputFile}.gz
tabix ${outputFile}.gz
else
# simply rename the raw calls as WGS results
for f in rawcalls*; do mv -v $f ${outputFile}${f#rawcalls.vcf}; done
fi

4 changes: 2 additions & 2 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@ dependencies:
- bcftools=1.9
- bioconductor-rtracklayer=1.42.1
- bwa=0.7.17
- cancerit-allelecount=2.1.2
- cancerit-allelecount=4.0.2
- control-freec=11.4
- ensembl-vep=96.0
- fastqc=0.11.8
- freebayes=1.2.0
- gatk4=4.1.1.0
- gatk4=4.1.2.0
- genesplicer=1.0
- htslib=1.9
- igvtools=2.3.93
Expand Down
Loading

0 comments on commit c117cba

Please sign in to comment.